# Example #1
def extract_terms(source_file=source_dataset,
                  method='cval',
                  out_file=out_file_cval):
    """Extract candidate terms from a text corpus and write them to CSV.

    Reads *source_file*, splits it into one document per line, runs the
    ATE term extractor over it, and writes term/score rows to *out_file*.

    :param source_file: path to the input corpus (default is the
        module-level ``source_dataset``, defined elsewhere in this file).
    :param method: ``'cval'`` ranks terms by C-value; any other value
        falls back to a frequency-based ranking via ``terms_freq``.
    :param out_file: path of the CSV file to write.
    :returns: the term/score pairs that were written.
    """
    with open(source_file, "r") as sf:
        dataset = sf.read()

    # Drop the period in "et al." so it is not mistaken for a sentence end.
    dataset = re.sub(r'et +al\.', 'et al', dataset)
    # Python 2 'unicode' builtin: decode UTF-8, silently dropping bad bytes.
    dataset = unicode(dataset, "utf-8", errors='ignore')
    # Split into one entry per line (handles both \r and \n separators).
    dataset = re.split(r'[\r\n]', dataset)

    term_extractor = ate.TermExtractor(stopwords=stopwords,
                                       term_patterns=term_patterns,
                                       min_term_words=min_term_words,
                                       min_term_length=min_term_length)
    if method == 'cval':
        terms = term_extractor.extract_terms(dataset)
        terms_freqs = term_extractor.c_values(terms, trace=False)
        # Python 2: csv expects a binary-mode file; QUOTE_NONE with an
        # empty quotechar writes raw ';'-separated fields.
        with open(out_file, 'wb') as csvfile:
            termwriter = csv.writer(csvfile,
                                    delimiter=';',
                                    quotechar='',
                                    quoting=csv.QUOTE_NONE)
            for cv in terms_freqs:
                termwriter.writerow(cv)
    else:
        # NOTE(review): 'terms_freq' is not defined in this chunk and is
        # called with no arguments — presumably defined elsewhere in the
        # file and must yield (key, value) pairs; verify before relying on
        # this branch.
        terms_freqs = terms_freq()
        with open(out_file, 'wb') as csv_file:
            writer = csv.writer(csv_file)
            for key, value in terms_freqs:
                writer.writerow([key, value])

    return terms_freqs
def tf(text):
    """Compute augmented term frequencies for the terms extracted from *text*.

    Extracts candidate terms with the module-level ATE configuration,
    counts each distinct term, then scales counts into the augmented
    frequency ``0.5 + 0.5 * count / max_count`` (keeps long documents
    from dominating the score).

    :param text: pre-split input text accepted by the term extractor.
    :returns: ``pandas.DataFrame`` with columns ``term`` and ``num``,
        sorted by ``num`` descending.
    """
    term_extractor = ate.TermExtractor(stopwords=stopwords,
                                       term_patterns=term_patterns,
                                       min_term_words=min_term_words,
                                       min_term_length=min_term_length)
    terms = term_extractor.extract_terms(text)
    print(len(terms))
    # Count occurrences of each term and track the largest raw count.
    # (The original shadowed the builtins 'map' and 'max', and tested
    # membership with 'term in map.keys()' — an O(n) list scan per term
    # under Python 2, i.e. quadratic overall. dict.get fixes both.)
    counts = {}
    max_count = 1
    for term in terms:
        counts[term] = counts.get(term, 0) + 1
        if counts[term] > max_count:
            max_count = counts[term]
    print(len(counts))
    # Rescale raw counts into augmented frequencies in [0.5, 1.0].
    for term in counts:
        counts[term] = 0.5 + 0.5 * counts[term] / max_count
    df = pd.DataFrame({'term': list(counts.keys()), 'num': list(counts.values())})
    return df.sort_values(['num'], ascending=False)
# Example #3
# Load extraction settings from the already-parsed 'config' (defined
# elsewhere in this file).
term_patterns = json.loads(config.get('main', 'term_patterns'))
doc_file = config.get('main', 'doc_file')
out_file = config.get('main', 'out_file')

# Read the whole document; 'with' guarantees the handle is closed even if
# read() raises (the original used open/read/close and could leak it).
with open(doc_file, "r") as fp:
    doc_txt = fp.read()
# Python 2 'unicode' builtin: decode UTF-8, silently dropping bad bytes.
doc_txt = unicode(doc_txt, "utf-8", errors='ignore')
# Drop the period in "et al." so it is not mistaken for a sentence end.
doc_txt = re.sub(r'et +al\.', 'et al', doc_txt)
# Split into one entry per line (handles both \r and \n separators).
doc_txt = re.split(r'[\r\n]', doc_txt)


def tf_idf(terms):
    """Placeholder for TF-IDF scoring: currently only prints *terms*.

    Uses the cross-version ``print(...)`` form instead of the Python 2
    print statement so this stub also parses under Python 3; for a single
    argument the output is identical on both versions.
    """
    print(terms)


# Build the extractor from module-level configuration (stopwords,
# term_patterns, min_term_words, min_term_length — loaded elsewhere in
# this file).
term_extractor = ate.TermExtractor(stopwords=stopwords,
                                   term_patterns=term_patterns,
                                   min_term_words=min_term_words,
                                   min_term_length=min_term_length)
# Extract candidate terms, then rank them by C-value (trace=True makes
# the library log its progress).
terms = term_extractor.extract_terms(doc_txt)
c_values = term_extractor.c_values(terms, trace=True)

# Python 2: csv expects a binary-mode file; QUOTE_NONE with an empty
# quotechar writes raw ';'-separated fields.
with open(out_file, 'wb') as csvfile:
    termwriter = csv.writer(csvfile,
                            delimiter=';',
                            quotechar='',
                            quoting=csv.QUOTE_NONE)
    for cv in c_values:
        termwriter.writerow(cv)