def extract_terms(source_file=source_dataset, method='cval', out_file=out_file_cval):
    """Extract term scores from a text dataset and write them to a CSV file.

    The dataset is read from ``source_file``, every "et al." is rewritten
    to "et al", the text is decoded as UTF-8 (undecodable bytes are
    dropped) and split into lines on CR / LF characters.

    :param source_file: path of the text file to read.
    :param method: ``'cval'`` extracts terms from the dataset and scores
        them with ``TermExtractor.c_values``; any other value takes the
        scores from ``terms_freq()`` instead.
    :param out_file: path of the CSV file that is (over)written.
    :return: the scores that were written to ``out_file``.
    """
    with open(source_file, "r") as handle:
        raw_text = handle.read()

    # Normalise the abbreviation before decoding and splitting into lines.
    raw_text = re.sub(r'et +al\.', 'et al', raw_text)
    lines = re.split(r'[\r\n]', unicode(raw_text, "utf-8", errors='ignore'))

    # The extractor is built for every method, although only the 'cval'
    # path uses it.
    extractor = ate.TermExtractor(
        stopwords=stopwords,
        term_patterns=term_patterns,
        min_term_words=min_term_words,
        min_term_length=min_term_length,
    )

    if method != 'cval':
        # Frequency path: two-column rows in the default CSV dialect.
        terms_freqs = terms_freq()
        with open(out_file, 'wb') as sink:
            csv.writer(sink).writerows(
                [key, value] for key, value in terms_freqs
            )
        return terms_freqs

    # C-value path: rows are written as returned, ';'-separated, unquoted.
    candidates = extractor.extract_terms(lines)
    terms_freqs = extractor.c_values(candidates, trace=False)
    with open(out_file, 'wb') as sink:
        row_writer = csv.writer(sink, delimiter=';', quotechar='',
                                quoting=csv.QUOTE_NONE)
        row_writer.writerows(terms_freqs)
    return terms_freqs
def tf(text):
    """Compute augmented term frequencies for the terms extracted from ``text``.

    Terms are extracted with an ``ate.TermExtractor`` configured from the
    module-level ``stopwords``, ``term_patterns``, ``min_term_words`` and
    ``min_term_length`` settings.  Every distinct term is scored as
    ``0.5 + 0.5 * count / max_count``, where ``max_count`` is the highest
    raw count of any single term.

    The number of extracted terms and the number of distinct terms are
    printed to stdout.

    :param text: input handed unchanged to ``TermExtractor.extract_terms``.
    :return: a ``pandas.DataFrame`` with the columns ``term`` and ``num``
        (the score), sorted by ``num`` in descending order.
    """
    # Function-scope import keeps this block independent of the file header.
    from collections import Counter

    term_extractor = ate.TermExtractor(
        stopwords=stopwords,
        term_patterns=term_patterns,
        min_term_words=min_term_words,
        min_term_length=min_term_length,
    )
    terms = term_extractor.extract_terms(text)
    print(len(terms))

    # Counter does the tally in one pass with O(1) lookups.
    # NOTE(review): assumes extract_terms returns a sequence of hashable
    # terms (not a mapping) - confirm against the ate module.
    counts = Counter(terms)
    print(len(counts))

    # With no terms there is nothing to normalise; fall back to 1 so that
    # max() is never called on an empty sequence.
    max_count = max(counts.values()) if counts else 1

    distinct_terms = list(counts)
    scores = [0.5 + 0.5 * counts[term] / max_count for term in distinct_terms]

    df = pd.DataFrame({'term': distinct_terms, 'num': scores})
    return df.sort_values(['num'], ascending=False)
# Settings read from the 'main' section of the configuration.
term_patterns = json.loads(config.get('main', 'term_patterns'))
doc_file = config.get('main', 'doc_file')
out_file = config.get('main', 'out_file')

# Read the whole document; the context manager closes the handle even when
# read() raises.
with open(doc_file, "r") as fp:
    doc_txt = fp.read()

# Decode as UTF-8 (dropping undecodable bytes), rewrite "et al." to "et al",
# then split into lines on CR / LF characters.
doc_txt = unicode(doc_txt, "utf-8", errors='ignore')
doc_txt = re.sub(r'et +al\.', 'et al', doc_txt)
doc_txt = re.split(r'[\r\n]', doc_txt)


def tf_idf(terms):
    """Print ``terms``; no TF-IDF computation is implemented here."""
    # NOTE(review): treated as a one-statement stub, with the extraction
    # below running at module level - confirm this is the intended layout.
    print(terms)


# Extract candidate terms from the document and score them with C-value.
term_extractor = ate.TermExtractor(
    stopwords=stopwords,
    term_patterns=term_patterns,
    min_term_words=min_term_words,
    min_term_length=min_term_length,
)
terms = term_extractor.extract_terms(doc_txt)
c_values = term_extractor.c_values(terms, trace=True)

# One row per scored entry, ';'-separated and unquoted.
with open(out_file, 'wb') as csvfile:
    termwriter = csv.writer(csvfile, delimiter=';', quotechar='',
                            quoting=csv.QUOTE_NONE)
    for cv in c_values:
        termwriter.writerow(cv)