Example #1
def parseFolder( dirPath ):
    assignments = {}
    # Raw strings keep the \d escapes in the fileid patterns from triggering warnings.
    draftReader = PlaintextCorpusReader(dirPath, r'\d+draft\d*.*')
    finalReader = PlaintextCorpusReader(dirPath, r'\d+final\d*.*')

    # Every draft should have a matching final, so the file count must be even.
    numFiles = len( os.listdir( dirPath ))
    assert numFiles % 2 == 0

    finalIdsSortedList = finalReader.fileids()
    draftIdsSortedList = draftReader.fileids()

    for pid in finalReader.fileids():
        final = finalReader.paras( pid )
        draft = draftReader.paras( pid )
        assn = assignment( draft, final )
        assignments[pid] = assn

    return assignments
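A minimal, hypothetical way to exercise this function, assuming a folder of paired files whose names match the patterns above (the folder name and file names are illustrative, not part of the original code):

# Hypothetical usage: essays/1draft.txt, essays/1final.txt, essays/2draft.txt, ...
assignments = parseFolder('essays/')
for pid, assn in assignments.items():
    print(pid, assn)   # one assignment object per final fileid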
Example #2
def parseFolder( dirPath ):
    assignments = []
    # Raw strings keep the \d escapes in the fileid patterns from triggering warnings.
    draftReader = PlaintextCorpusReader(dirPath, r'\d+draft\d*.*')
    finalReader = PlaintextCorpusReader(dirPath, r'\d+final\d*.*')

    numFiles = len( os.listdir( dirPath ))
    assert numFiles % 2 == 0

    finalIdsSortedList = finalReader.fileids()
    draftIdsSortedList = draftReader.fileids()

    # fileids() returns sorted ids, so index i pairs the i-th draft with the i-th final.
    for i in range(len(finalIdsSortedList)):
        final = finalReader.paras( finalIdsSortedList[i] )
        draft = draftReader.paras( draftIdsSortedList[i] )
        assn = assignment( draft, final )
        assignments.append( assn )

    return assignments
Example #3
 def token_in_coverage(self):
     # Fraction of resumes whose lemmas overlap self.tokens_in.
     corpusdir = 'data/cv_corpus'
     corpa = PlaintextCorpusReader(corpusdir, '.*', encoding='windows-1252')
     # Each paragraph (a list of sentences) is flattened into one token list and treated as a resume.
     resumes = [[item for sent in paras for item in sent] for paras in corpa.paras()]
     cpt = 0
     for resume in resumes:
         resume_text = " ".join(resume)
         resume_sents = nltk.sent_tokenize(resume_text)
         # sent_tokenize returns plain strings, so each sentence is passed to spaCy directly.
         resume_words = set(token.lemma_ for sent in resume_sents for token in nlp(sent.lower()))
         if not resume_words.isdisjoint(self.tokens_in):
             cpt += 1
     coverage = cpt * 1.0 / len(resumes)
     print("token_in coverage : {}".format(coverage))
Example #4
def processFile(newCorpusDir):
    # Build a small plaintext corpus from a text file, a PDF and a DOCX document.
    if not os.path.isdir(newCorpusDir):
        os.mkdir(newCorpusDir)
    txt1 = getText('sample_feed.txt')
    txt2 = pdf.getTextPDF('VirtualBoxTroubleshooting.pdf')
    txt3 = word.getTextWord('my_doc.docx')

    files = [txt1, txt2, txt3]
    for idx, f in enumerate(files):
        # os.path.join keeps this working whether or not newCorpusDir ends in a slash.
        with open(os.path.join(newCorpusDir, str(idx) + '.txt'), 'w') as fout:
            fout.write(f)

    newCorpus = PlaintextCorpusReader(newCorpusDir, '.*')

    print(newCorpus.words())
    print(newCorpus.sents(newCorpus.fileids()[1]))
    print(newCorpus.paras(newCorpus.fileids()[0]))
Example #5
def Read_corpus(path_c, fname_c, fo1):
    # Only the imports this function actually uses; the spaCy model is loaded by name.
    import spacy
    nlp = spacy.load('en_core_web_sm')
    from nltk.corpus.reader.plaintext import PlaintextCorpusReader

    pcorpus = PlaintextCorpusReader(path_c, fname_c, encoding="utf")

    # Write the opening HTML tags for this file.
    fappend(fo1, P_htmltag.writehtmltag1(fname_c), fname_c)

    # Iterate through each paragraph, replace its tags, and run spaCy on every non-empty line.
    for para in pcorpus.paras():
        L0 = rep_tags(para)
        L1 = L0.split("\n")
        for i, w in enumerate(L1):
            if w != "":
                ApplyNLP(nlp(str(w[1:])), fo1)  # w[1:] drops the first character (presumably a leading marker)

    fappend(fo1, P_htmltag.writehtmltag3(fname_c), fname_c)
Example #6
class Contract_Reader():
    def __init__(self, config):
        print('Filepath for texts = ', config.textpath)
        self.corpus = PCR(config.textpath,
                          r'.*\.txt',
                          encoding='utf-16',
                          para_block_reader=read_line_block)
        if config.clean_paragraphs == 'yes':
            self.clean(config, mode='para')
        if config.clean_sentences == 'yes':
            self.clean(config, mode='sent')
        #Corpus summaries
        self.corpus_info()
        self.LDA(config.num_topics, config.num_words)
        self.plot(config.num_words)

    def clean(self, config, mode='sent'):
        stop = set(stopwords.words('english'))
        exclude = set(string.punctuation)
        lemma = WNL()
        if mode == 'para':
            # Paragraphs are lists of sentences, each a list of tokens; flatten each paragraph into one token list.
            self.para_list = [
                list(itertools.chain.from_iterable(para))
                for para in self.corpus.paras()
            ]
            for index, paragraph in enumerate(self.para_list):
                paragraph = " ".join(paragraph)
                stop_free = " ".join(
                    [i for i in paragraph.lower().split() if i not in stop])
                punc_free = ''.join(ch for ch in stop_free
                                    if ch not in exclude)
                normalized = " ".join(
                    lemma.lemmatize(word) for word in punc_free.split())
                self.para_list[index] = normalized
            print(self.para_list[0])
            self.para_list = [para.split() for para in self.para_list]
            print(self.para_list[0])
        if mode == 'sent':
            # Obtain a list of strings, one per sentence, rather than a list of token lists.
            self.sents_list = [" ".join(sent) for sent in self.corpus.sents()]
            for index, sentence in enumerate(self.sents_list):
                stop_free = " ".join(
                    [i for i in sentence.lower().split() if i not in stop])
                punc_free = ''.join(ch for ch in stop_free
                                    if ch not in exclude)
                normalized = " ".join(
                    lemma.lemmatize(word) for word in punc_free.split())
                self.sents_list[index] = normalized
            print(self.sents_list[0])
            self.sents_list = [
                sentence.split() for sentence in self.sents_list
            ]
            print(self.sents_list[0])

    def LDA(self, num_topics, num_words):
        # Assumes clean(config, mode='para') has already run so that self.para_list exists.
        dictionary = corpora.Dictionary(self.para_list)
        doc_term_matrix = [dictionary.doc2bow(para) for para in self.para_list]
        path = '/mnt/APPDATA/Project_Mafia/omkhalil/vowpal_binaries/vw-7.20150623'
        self.ldamodel = LdaVowpalWabbit(path,
                                        doc_term_matrix,
                                        num_topics=num_topics,
                                        id2word=dictionary)
        self.ldamodel.save('model/lda_model')
        print(self.ldamodel.print_topics(num_topics=10, num_words=num_words))

    def plot(self, num_words):
        for t in range(self.ldamodel.num_topics):
            plt.figure()
            tuples = [
                reversed(x) for x in self.ldamodel.show_topic(t, num_words)
            ]
            plt.imshow(WordCloud().fit_words(dict(tuples)))
            plt.axis("off")
            plt.title("Topic #" + str(t))
            plt.savefig('plots/topic' + str(t))

    def corpus_info(self):
        """
        Summary information about the status of a corpus.
        """
        fids = len(self.corpus.fileids())
        paras = len(self.corpus.paras())
        sents = len(self.corpus.sents())
        sperp = sum(len(para) for para in self.corpus.paras()) / float(paras)
        tokens = FreqDist(self.corpus.words())
        count = sum(tokens.values())
        vocab = len(tokens)
        lexdiv = float(count) / float(vocab)

        print(
            ("Text corpus contains {} files\n"
             "Composed of {} paragraphs and {} sentences.\n"
             "{:0.3f} sentences per paragraph\n"
             "Word count of {} with a vocabulary of {}\n"
             "lexical diversity is {:0.3f}").format(fids, paras, sents, sperp,
                                                    count, vocab, lexdiv))
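A hypothetical way to drive the class above; the config object and its field values are assumptions inferred from the attributes Contract_Reader reads, not part of the original code:

from types import SimpleNamespace

# All field names below are inferred from the class; adjust paths and values to your setup.
config = SimpleNamespace(
    textpath='contracts/',      # folder of .txt files (utf-16 encoded in this example)
    clean_paragraphs='yes',
    clean_sentences='no',
    num_topics=10,
    num_words=8,
)
reader = Contract_Reader(config)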
Example #7
import os
import word, pdf
from nltk.corpus.reader.plaintext import PlaintextCorpusReader


def getText(txtFileName):
    file = open(txtFileName, 'r')
    return file.read()


newCorpusDir = 'mycorpus/'
if not os.path.isdir(newCorpusDir):
    os.mkdir(newCorpusDir)

txt1 = getText('sample_feed.txt')
txt2 = pdf.getTextPDF('sample-pdf.pdf')
txt3 = word.getTextWord('sample-one-line.docx')

files = [txt1, txt2, txt3]
for idx, f in enumerate(files):
    with open(newCorpusDir + str(idx) + '.txt', 'w') as fout:
        fout.write(f)

newCorpus = PlaintextCorpusReader(newCorpusDir, '.*')

print(newCorpus.words())
print(newCorpus.sents(newCorpus.fileids()[1]))
print(newCorpus.paras(newCorpus.fileids()[0]))
Example #8
# Truncated snippet: newcorpus is a PlaintextCorpusReader over the corpus folder and
# infile iterates over its fileids (see Example #12 for the full setup).
for infile in sorted(newcorpus.fileids()):
    with newcorpus.open(infile) as fin:  # Opens the file.
        print fin.read().strip()  # Prints the content of the file
print

# Access the plaintext; outputs pure string/basestring.
print newcorpus.raw().strip()
print

# Access paragraphs in the corpus. (list of list of list of strings)
# NOTE: NLTK tokenizes lazily; by default PlaintextCorpusReader uses the Punkt
#       sentence tokenizer and WordPunctTokenizer for words.
#
# Each element in the outermost list is a paragraph, and
# Each paragraph contains sentence(s), and
# Each sentence contains token(s)
print newcorpus.paras()
print

# To access paragraphs of a specific fileid.
print newcorpus.paras(newcorpus.fileids()[0])

# Access sentences in the corpus. (list of list of strings)
# Note that the texts are flattened into sentences that contain tokens.
print newcorpus.sents()
print

# To access sentences of a specific fileid.
print newcorpus.sents(newcorpus.fileids()[0])

# Access just tokens/words in the corpus. (list of strings)
print newcorpus.words()
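As a rough illustration of the nesting described in the comments above (the values are made up, not real corpus output):

# paras(): paragraphs -> sentences -> tokens, e.g.
# [[['Hello', 'world', '.'], ['Second', 'sentence', '.']], ...]
first_para = newcorpus.paras()[0]   # one paragraph: a list of sentences
first_sent = first_para[0]          # one sentence: a list of token strings
first_token = first_sent[0]         # one token: a plain string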
Example #9
class TextAnalizer:
    def __init__(self, my_input_file):
        self.config = configparser.ConfigParser()
        self.config.read("text_analysis.cfg")
        self.input_file = my_input_file
        self.nlp_model = self.config["DEFAULT"]["nlp_model"]
        #The output file name
        self.output_file = self.config["DEFAULT"]["output_file"]
        self.nlp = load_nlp(self.nlp_model)
        self.corpus = CorpusReader(".", self.input_file)
        self.raw_text = self.corpus.raw()
        self.nlp_text = self.nlp(self.raw_text)
        # Collect the information needed for the spaCy-based text analysis.
        self.analysis_dictionary = Counter()
        self.word_count = 0
        self.get_word_count_nltk()

    def get_paragraph(self):
        return self.corpus.paras()

    def get_sentence(self):
        return self.corpus.sents()

    def get_word(self):
        return self.corpus.words()

    def get_word_count_nltk(self):
        tokenizer = Tokenizer(r'\w+')
        counts = Counter()
        sentences = self.get_sentence()
        for sentence in sentences:
            tokens = tokenizer.tokenize(" ".join(sentence))
            self.word_count = self.word_count + len(tokens)
            filtered = [w for w in sentence if w.isalnum()]
            counts = counts + Counter(filtered)
        return counts, self.word_count

    def analize_nlp(self):
        analized_data_str = (self.config["ANALIZED"]["POS"])
        analized_data = (analized_data_str.split(","))
        result_dict = {}
        diff_str, tot_str = (
            self.config["DEFAULT"]["diff_tot_string"]).split(",")
        lemma_counter = Counter()
        pos_counter = Counter()
        tag_counter = Counter()

        for token in self.nlp_text:
            lemma_counter = lemma_counter + Counter([token.lemma_])
            pos_counter = pos_counter + Counter([token.pos_])
            tag_counter = tag_counter + Counter([token.tag_])
            my_key = token.lemma_ + "_" + token.tag_ + "_" + token.pos_
            self.analysis_dictionary[my_key] += 1
        for pos in analized_data:
            instance_counter = 0
            total_counter = 0
            for key in self.analysis_dictionary.keys():
                try:
                    my_lemma, my_tag, my_pos = key.split("_")
                except ValueError:
                    print("Warning: array has an empty line")  # TODO: switch to logging
                    continue
                if pos == my_pos:
                    instance_counter += 1
                    total_counter = total_counter + self.analysis_dictionary.get(
                        key)
            result_dict[pos + diff_str] = instance_counter
            result_dict[pos + tot_str] = total_counter
        #add the stuff from nltk
        diff_word, word_count = self.get_word_count_nltk()
        result_dict["WORDS" + tot_str] = word_count
        result_dict["WORDS" + diff_str] = len(diff_word)
        result_dict["PARAGRAPHS"] = len(self.get_paragraph())
        result_dict["SENTENCES"] = len(self.get_sentence())

        return result_dict

    def write_output(self):
        # get_word_count_nltk() returns (distinct-word Counter, total word count).
        diff_word, _ = self.get_word_count_nltk()
        with open(self.output_file, "w+") as f:
            f.write("Number of paragraphs: " +
                    str(len(self.get_paragraph())) + "\n")
            f.write("Number of sentences: " + str(len(self.get_sentence())) +
                    "\n")
            f.write("Number of words: " + str(self.word_count) + "\n")
            f.write("Average words per sentence: " +
                    str(round(self.word_count / len(self.get_sentence()), 2)) +
                    "\n")
            f.write("Number of different words: " +
                    str(len(diff_word)) + "\n")
            f.write("Text variety (different words/total words): " + str(
                round(len(diff_word) / self.word_count, 2)) + "\n")
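A hedged usage sketch for the class above; the text_analysis.cfg layout shown in the comments is inferred from the keys the class reads and is not taken from the original project:

# Assumed text_analysis.cfg layout:
# [DEFAULT]
# nlp_model = en_core_web_sm
# output_file = analysis.txt
# diff_tot_string = _diff,_tot
# [ANALIZED]
# POS = NOUN,VERB,ADJ

analyzer = TextAnalizer("my_text.txt")   # any plaintext file in the working directory
print(analyzer.analize_nlp())            # per-POS counts plus word/sentence/paragraph totals
analyzer.write_output()                  # writes the summary report to output_file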
Example #10
import os
import word, pdf
from nltk.corpus.reader.plaintext import PlaintextCorpusReader


def getText(txtFileName):
    # Read a plaintext file and return its contents as a string.
    file = open(txtFileName, 'r')
    return file.read()


# Create the corpus folder
newCorpusDir = 'mycorpus/'
if not os.path.isdir(newCorpusDir):  # Does the corpus folder already exist?
    os.mkdir(newCorpusDir)

# Read the input files
# Plain text file
txt1 = getText('./Files/sample_feed.txt')
# PDF file
txt2 = pdf.getTextPDF('./Files/sample-pdf.pdf')
# DOCX file
txt3 = word.getTextWord('./Files/sample-one-line.docx')

# Write the files out
files = [txt1, txt2, txt3]
for idx, f in enumerate(files):
    with open(newCorpusDir + str(idx) + '.txt', 'w') as fout:
        fout.write(f)

# Build the custom corpus
# Read every file in the folder and create a corpus from them
newCorpus = PlaintextCorpusReader(newCorpusDir, '.*')

# Check that the custom corpus was built correctly
print(newCorpus.words())  # Array containing every word in the corpus
print(newCorpus.sents(newCorpus.fileids()[1]))  # Print the array of all sentences in 1.txt
print(newCorpus.paras(newCorpus.fileids()[0]))  # Print the array of all paragraphs in 0.txt
Example #11
# Filename of the text file to analyze
input_file = config["DEFAULT"]["input_file"]
#The nlp model used
nlp_model = config["DEFAULT"]["nlp_model"]
#The output file name
output_file = config["DEFAULT"]["output_file"]

#Load the nlp model
nlp = load_nlp(nlp_model)

# This section builds an NLTK corpus and a raw text string (for spaCy)
corpus = CorpusReader(".", input_file)
my_text = corpus.raw()

# This section handles the NLTK-based analysis
paragraphs = corpus.paras()
sentences = corpus.sents()
words = corpus.words()

tokenizer = Tokenizer(r'\w+')
word_count = 0
counts = Counter()

for sentence in sentences:
    tokens = tokenizer.tokenize(" ".join(sentence))
    word_count = word_count + len(tokens)
    filtered = [w for w in sentence if w.isalnum()]
    counts = counts + Counter(filtered)

nlp_text = nlp(my_text)
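A small follow-up, not in the original script, showing how the accumulated counters could be summarized once the loop has run:

print(counts.most_common(10))                          # most frequent alphanumeric tokens
print("distinct words:", len(counts))
print("total words:", word_count)
print("variety:", round(len(counts) / word_count, 2))  # guard against word_count == 0 in real use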
Example #12
def try_out_some_functionalities():

    corpusdir ="/media/benzro/OS/Users/benzro/Desktop/Studium Uni/2)" \
           "ZweitesSemester/27)PCL-2/Uebungen/Uebung03/Enron/test/"
    newcorpus = PCR(corpusdir, '.*')

    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "access one file in the corpus"
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    # fileids are relative to the corpus root, so a bare file name is used here.
    infile = "0004.1999-12-14.farmer.ham.txt"
    fin = newcorpus.open(infile)
    print fin.read().strip()
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "all file ids"
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print newcorpus.fileids()
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "access each file in the corpus"
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    # (reduced output: [0:2])
    for infile in sorted(newcorpus.fileids()):
        # the fileids of each file
        print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
        print infile
        # opens the file
        fin = newcorpus.open(infile)
        # prints the content of the file
        print fin.read().strip()
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "access the plaintext; outputs pure string of all files"
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print newcorpus.raw().strip()
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "Access paragraphs in the corpus. (list of list of list of strings)"
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    # NOTE: NLTK tokenizes lazily; by default PlaintextCorpusReader uses the Punkt
    #       sentence tokenizer and WordPunctTokenizer for words.
    #
    # Each element in the outermost list is a paragraph, and
    # Each paragraph contains sentence(s), and
    # Each sentence contains token(s)
    print newcorpus.paras()
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "To access pargraphs of a specific fileid."
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print newcorpus.paras(newcorpus.fileids()[0])
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "Access sentences in the corpus. (list of list of strings)"
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    # Note that the texts are flattened into sentences that contain tokens.
    print newcorpus.sents()
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "To access sentences of a specific fileid."
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print newcorpus.sents(newcorpus.fileids()[0])
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "Access just tokens/words in the corpus. (list of strings)"
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print newcorpus.words()
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print "To access tokens of a specific fileid."
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    print newcorpus.words(newcorpus.fileids()[0])
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
Example #13
# Truncated snippet: newcorpus is a PlaintextCorpusReader over the corpus folder and
# infile iterates over its fileids (see Example #12 for the full setup).
for infile in sorted(newcorpus.fileids()):
    fin = newcorpus.open(infile)  # Opens the file.
    print fin.read().strip()  # Prints the content of the file
print

# Access the plaintext; outputs pure string/basestring.
print newcorpus.raw().strip()
print 

# Access paragraphs in the corpus. (list of list of list of strings)
# NOTE: NLTK tokenizes lazily; by default PlaintextCorpusReader uses the Punkt
#       sentence tokenizer and WordPunctTokenizer for words.
#
# Each element in the outermost list is a paragraph, and
# Each paragraph contains sentence(s), and
# Each sentence contains token(s)
print newcorpus.paras()
print

# To access paragraphs of a specific fileid.
print newcorpus.paras(newcorpus.fileids()[0])

# Access sentences in the corpus. (list of list of strings)
# Note that the texts are flattened into sentences that contain tokens.
print newcorpus.sents()
print

# To access sentences of a specific fileid.
print newcorpus.sents(newcorpus.fileids()[0])

# Access just tokens/words in the corpus. (list of strings)
print newcorpus.words()
Example #14
 def build_d2v_model(self):
     print("Starting to build the Doc2Vec model")
     corpusdir = 'data/cv_corpus'
     corpa = PlaintextCorpusReader(corpusdir, '.*', encoding='windows-1252')
     print("tokenizing...")
     # One token list per paragraph: clean each sentence, lemmatize with spaCy,
     # and drop any lemma found in stopset.
     resumes = [[token.lemma_ for sent in paras for token in nlp(" ".join(self.clean(sent)).lower()) if token.lemma_ not in stopset]
                for paras in corpa.paras()]
     print("tokenization completed")
     documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(resumes)]
     model = Doc2Vec(documents, vector_size=self.cv_length, window=5, min_count=1, workers=4)
     print("Finished building the Doc2Vec model")
     return model
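A brief, assumption-laden follow-up showing what could be done with the returned model using gensim's standard Doc2Vec API (the instance name and the token list are illustrative):

model = reader.build_d2v_model()                              # 'reader' stands in for an instance of the surrounding class
vec = model.infer_vector(['python', 'nltk', 'experience'])    # embed an unseen, pre-tokenized resume
# model.dv in gensim >= 4.0 (model.docvecs in older releases)
print(model.dv.most_similar([vec], topn=3))                   # closest training resumes by cosine similarity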