def main():
    tfidf = TfIdf(corpus_filename="moviecorpus.txt")
    # tfidf.add_document_to_corpus()
    # print tfidf.term_freq
    # print tfidf.num_words
    for line in tfidf.get_summary('oblivion.txt', 5):
        print line
def createTFIDFTopics(self):
    self.db = psycopg2.connect("dbname=%s user=%s password=%s host=%s" % (
        self.dbname, self.dbuser, self.dbpass, self.dbhost))
    c = self.db.cursor()
    headlines = {}
    c.execute(
        "SELECT article_day,country,title,url,article_hash FROM articles_headlines")
    for row in c.fetchall():
        title = row[2]
        # c.execute('SELECT content from articles where hash = ?',(row[4],))
        # content = c.fetchone()[0]
        lista = headlines.get(str(row[0]) + '-' + row[1])
        if lista is None:
            # headlines[str(row[0])+'-'+row[1]] = [title + ' ' + content]
            headlines[str(row[0]) + '-' + row[1]] = [title]
        else:
            # headlines[str(row[0])+'-'+row[1]].append(title + ' ' + content)
            headlines[str(row[0]) + '-' + row[1]].append(title)
    self.db.close()
    for hd, contents in headlines.items():
        print(f'>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> {hd}')
        with open('stopwords.txt', 'r') as st:
            tfidf = TfIdf(stopwords=[x.strip() for x in st.readlines()])
            tfidf.parse(contents)
def createTFIDFTopics(self):
    self.db = sqlite3.connect(self.dbname, detect_types=sqlite3.PARSE_DECLTYPES)
    c = self.db.cursor()
    headlines = {}
    c.execute(
        "SELECT article_day,country,title,url,article_hash FROM articles_headlines")
    for row in c.fetchall():
        title = row[2]
        # c.execute('SELECT content from articles where hash = ?',(row[4],))
        # content = c.fetchone()[0]
        lista = headlines.get(str(row[0]) + '-' + row[1])
        if lista is None:
            # headlines[str(row[0])+'-'+row[1]] = [title + ' ' + content]
            headlines[str(row[0]) + '-' + row[1]] = [title]
        else:
            # headlines[str(row[0])+'-'+row[1]].append(title + ' ' + content)
            headlines[str(row[0]) + '-' + row[1]].append(title)
    self.db.close()
    for hd, contents in headlines.iteritems():
        print '>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> ' + hd
        with open('stopwords.txt', 'r') as st:
            tfidf = TfIdf(stopwords=[x.strip() for x in st.readlines()])
            tfidf.parse(contents)
def calcularfrecuencia(self, texto, palabra=[]):
    table = TfIdf()
    table.add_document("informacion", texto)
    resultado = table.similarities(palabra)[0][1]
    if resultado > 0.0:
        return True
    return False
def __init__(self, sql_obj=None):
    if not sql_obj:
        self.sql = SQLQuery()
    else:
        self.sql = sql_obj
    self.tfidf_obj = TfIdf()
    self.ids = None
def test_similarity(self): table = TfIdf() table.add_document("foo", ["a", "b", "c", "d", "e", "f", "g", "h"]) table.add_document("bar", ["a", "b", "c", "i", "j", "k"]) table.add_document("baz", ["k", "l", "m", "n"]) self.assertEqual( table.similarities(["a", "b", "c"]), [["foo", 0.6875], ["bar", 0.75], ["baz", 0.0]])
def test_provider():
    vocab = load_pickled('vocab.dat')
    tfidf = TfIdf(vocab, ['./data/train_pos.txt', './data/train_neg.txt'])
    lda = LdaLoader('topics.lda', 200)
    label_vectorizer = LabelVectorizer(load_pickled('labels.dat'))
    stemmer = MemoizedStemmer()
    pos_provider = TrainingSampleProvider('./data/train_pos.txt', 1, vocab,
                                          tfidf, lda, label_vectorizer, stemmer)
    return pos_provider
def gen_extra_sentences():
    word_idf_file = 'e:/el/tmpres/demo/merge/word_idf.txt'
    tfidf = TfIdf(word_idf_file)
    mesh_id_wid_file = 'e:/el/tmpres/demo/merge/mesh_id_wid.txt'
    merged_desc_file = 'e:/el/tmpres/demo/merge/merged_descriptions.txt'
    merged_tokenized_desc_file = 'e:/el/tmpres/demo/merge/merged_descriptions_tokenized.txt'
    extra_sentence_file = 'e:/el/tmpres/demo/merge/wiki_extra_sentences.txt'

    mesh_ids = list()
    wids = list()
    fin = open(mesh_id_wid_file, 'rb')
    for line in fin:
        vals = line.strip().split('\t')
        mesh_ids.append(vals[0])
        wids.append(int(vals[1]))
    fin.close()

    fin_desc = open(merged_desc_file, 'rb')
    fin_token_desc = open(merged_tokenized_desc_file, 'rb')
    fout = open(extra_sentence_file, 'wb')
    for idx, (mesh_id, mesh_desc, mesh_token_desc) in enumerate(
            izip(mesh_ids, fin_desc, fin_token_desc)):
        mesh_token_desc = mesh_token_desc.strip()
        mesh_desc_words = mesh_token_desc.split(' ')
        mesh_sentence_ends = find_sentence_ends(mesh_desc_words)

        wiki_desc = fin_desc.next().strip()
        wiki_token_desc = fin_token_desc.next().strip()
        wiki_desc_words = wiki_token_desc.split(' ')
        wiki_sentence_ends = find_sentence_ends(wiki_desc_words)

        extra_sentence_indices = get_sentences_to_add(
            mesh_desc_words, mesh_sentence_ends,
            wiki_desc_words, wiki_sentence_ends, tfidf)
        wiki_words_to_pos_list = tokenized_text_match(wiki_desc, wiki_desc_words)
        original_sentences = get_original_sentences(
            wiki_desc, wiki_words_to_pos_list, wiki_sentence_ends)

        fout.write('%s\t%d\n' % (mesh_id, len(extra_sentence_indices)))
        for j in extra_sentence_indices:
            fout.write('%s\n' % original_sentences[j])
        # if idx == 10000:
        #     break
    fin_desc.close()
    fin_token_desc.close()
    fout.close()
def build_tfidf_model(self, files):
    '''
    Builds the Tf-Idf model.
    :param files: List of files of the corpora
    :return: A Tf-Idf object with the model loaded
    '''
    tfidf = TfIdf()
    for file_path in files:
        with open(file_path) as f:
            doc_name = file_path.split('/')[-1]
            doc_text = f.readline().split()
            tfidf.add_document(doc_name, doc_text)
    return tfidf
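A hedged standalone sketch of the same pattern build_tfidf_model uses, not taken from the original project: it assumes a TfIdf class exposing add_document(name, tokens) and similarities(tokens), as in the other snippets in this collection, and the file paths and query terms are made up.

# Hypothetical usage sketch: file names and query are illustrative only.
tfidf = TfIdf()
for file_path in ['corpus/doc_a.txt', 'corpus/doc_b.txt']:
    with open(file_path) as f:
        # Same convention as build_tfidf_model above: the document name is the
        # file's basename and the text is the first line, split on whitespace.
        tfidf.add_document(file_path.split('/')[-1], f.readline().split())
print(tfidf.similarities(['example', 'query']))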
def main(args):
    summarizer = {
        'tfidf': TfIdf(),
        'cluster': Cluster(),
        'svd': SVD(),
        'pagerank': PageRank()
    }[args['alg']]
    summarizer.initialize(args['tf'], args['df'])
    summary = summarizer.summarize(args['doc'])
    for s in summary:
        print(s),
def test_tfidf(self):
    clean_tmp()
    t = TfIdf(self.data, root_dir)
    self.assertTrue(t.idf_cache['I'] < t.idf_cache['hello'])
    self.assertTrue(t.idf_cache['I'] < t.idf_cache['You'])
    self.assertTrue(t.idf_cache['I'] < t.idf_cache['not exist feature'],
                    "test default idf_default_val")
    result1 = t.tfidf_in_a_doc(self.data[1])
    self.assertTrue(result1['I'] < result1['You'])
    self.assertTrue(result1['You'] < result1['hello'])
    self.assertTrue(result1['hello'] == result1['world'])
    clean_tmp()
def tfidf_matrix(text_generator):
    """Builds a tf-idf matrix from the texts yielded by text_generator.

    Note: text_generator is iterated twice (once to index the documents, once
    to fill the matrix), so it must be re-iterable (e.g. a list), not a
    one-shot generator.
    """
    ti = TfIdf()
    # print "building tfidf indices"
    for i in text_generator:
        ti.add_input_document(i)
    A = np.zeros([ti.num_docs, len(ti.term_num_docs)])
    for i_ind, i in enumerate(text_generator):
        # print "-i_ind, i:", i_ind, i
        for j_ind, j in enumerate(ti.get_tfidf(i)):
            # print "-----j_ind, j:", j_ind, j
            A[i_ind, j_ind] = j
    # print ti.term_num_docs
    return A, ti
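A hedged driver sketch for tfidf_matrix above, not part of the original snippet: the documents are illustrative, and a list rather than a one-shot generator is passed because the function iterates its input twice.

# Illustrative call to tfidf_matrix with made-up documents.
docs = ['the cat sat on the mat', 'the dog chased the cat', 'cats and dogs']
A, ti = tfidf_matrix(docs)
print(A.shape)  # (number of documents, number of indexed terms)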
def test_similarity(self): table = TfIdf() table.add_document("doc1", [ "The", "game", "of", "life", "is", "a", "game", "of", "everlasting", "learning" ]) table.add_document( "doc2", ["The", "unexamined", "life", "is", "not", "worth", "living"]) table.add_document("doc3", ["Never", "stop", "learning"]) table.calculate_tf() table.calculate_idf() table.calculate_tf_idf() """self.assertEqual( table.similarities(["life","learning"]), [["foo", 1.0], ["bar", 0.707106781], ["baz", 0.707106781]])""" print(table.similarities(["life", "learning"]))
def calculateTFIDFofNew(self, inputTitle, inputBody):
    title = self.textToWordsArray(inputTitle)
    sentences = self.textArrayToWordsArray(inputBody)
    if len(sentences) < 1:
        return []
    table = TfIdf()
    for i in range(0, len(sentences)):
        table.add_document("sentences" + str(i), sentences[i])
    result = []
    similarities = table.similarities(title)
    for similarity in similarities:
        result.append(similarity[1])
    resLen = len(result)
    for i in range(resLen, 5):
        result.append(0)
    return result
def train_setup(vocab_file, pos_file, neg_file, cluster_labels_file, validation_file):
    vocab = load_pickled(vocab_file)
    tfidf = TfIdf(vocab, [pos_file, neg_file])
    label_vectorizer = LabelVectorizer(load_pickled(cluster_labels_file))
    stemmer = MemoizedStemmer()
    pos_provider = TrainingSampleProvider(pos_file, 1, vocab, tfidf,
                                          label_vectorizer, stemmer)
    neg_provider = TrainingSampleProvider(neg_file, -1, vocab, tfidf,
                                          label_vectorizer, stemmer)
    merged = SampleMerger(pos_provider, neg_provider)
    validation_provider = ValidationSampleProvider(validation_file, None, vocab,
                                                   tfidf, label_vectorizer, stemmer)
    return merged, validation_provider
def train_setup():
    vocab = load_pickled('vocab.dat')
    tfidf = TfIdf(vocab, ['./data/train_pos.txt', './data/train_neg.txt'])
    lda = LdaLoader('topics.lda', 200)
    label_vectorizer = LabelVectorizer(load_pickled('labels.dat'))
    stemmer = MemoizedStemmer()
    pos_provider = TrainingSampleProvider('./data/train_pos.txt', 1, vocab,
                                          tfidf, lda, label_vectorizer, stemmer)
    neg_provider = TrainingSampleProvider('./data/train_neg.txt', -1, vocab,
                                          tfidf, lda, label_vectorizer, stemmer)
    merged = SampleMerger(pos_provider, neg_provider)
    validation_provider = ValidationSampleProvider('./data/test_data.txt', None,
                                                   vocab, tfidf, lda,
                                                   label_vectorizer, stemmer)
    return merged, validation_provider
class CleanedTfIdf(TfIdf):
    """Overrides the default TfIdf class to only produce correctly spelled
    English-language words. Instantiated here to prevent an nltk dependency
    unless needed."""

    def get_tokens(self, str):
        """Overrides the default tokenizer to only use correctly spelled
        English-language words. Breaks a string into tokens, preserving URL
        tags as an entire token. This implementation does not preserve case.
        """
        raw_tokens = re.findall(r"<a.*?/a>|<[^\>]*>|[\w'@#]+", str.lower())
        return [t for t in raw_tokens if wordnet.synsets(t)]

comment_model = CleanedTfIdf(stopword_filename="stopwords.txt", DEFAULT_IDF=None)
else:
    comment_model = TfIdf(stopword_filename="stopwords.txt", DEFAULT_IDF=None)

if "--in-mem" in args or "-m" in args:
    # pop the arg
    if "--in-mem" in args:
        args.pop(args.index("--in-mem"))
    if "-m" in args:
        args.pop(args.index("-m"))
    print "shifting db to memory"
    # Read database to tempfile
    tempfile = StringIO.StringIO()
    for line in conn.iterdump():
        tempfile.write(u'{0}\n'.format(line))
    conn.close()
    tempfile.seek(0)
def __init__(self):
    self.tfidf = TfIdf()
except IndexError:
    save_file = 'pickled_tfidf.pickle'

print "saving to ", save_file
try:
    with open(save_file) as rh:
        top_100 = cPickle.load(rh)
except IOError:
    top_100 = {}
print "proceeding with", len(top_100), "previous tfidf docs"
with open(save_file, 'w') as wh:
    wh.write('0\n')

comment_model = TfIdf(corpus_filename="idf_model_filteredsorted.txt",
                      stopword_filename="curated_stopwords.txt",
                      DEFAULT_IDF=0.0000001)  # if not in idf model, give very low score, since model is filtered

# find the number of beers for progress indication
c.execute("SELECT id from beer")
total_beers = len(list(c.fetchall()))
print "calculating tfidf of ", total_beers, "beers."

c.execute("SELECT id, name FROM beer")
idx = 0  # don't want to unwrap the generator, so we'll index this way
worked = 0
for beer_id, name in c.fetchall():
    if idx % 1000 == 0:
        print """*-*-*-* Finished {0}% of the processing.""".format(float(idx) / total_beers)
        with open(save_file, 'w') as wh:
            cPickle.dump(top_100, wh)
    idx += 1
from tfidf import TfIdf
import pandas as pd

corpuspath = '/Users/goksukara/Desktop/Projects/EclipseWorkspace/Specilization/PhytonCode/Data/'

if __name__ == "__main__":
    Tf_idf = TfIdf(corpuspath + 'Gensim_output')
    Tf_idf.loaddictionary()
    Tf_idf.buildmodel()
    Tf_idf.saveModel()
    Tf_idf.getTF_IDF()
    # print(Tf_idf.corpus_dict)
    # Tf_idf.listnhighIdfs(4)
def __init__(self):
    pio.renderers.default = 'browser'
    tfidf = TfIdf()
    self.ids, self.titles, self.matrix = tfidf.get_matrix()
    self.vectorizer = tfidf.get_vectorizer()
def menu():
    print("Que deseja fazer?")
    print("1 - Consultar a informação do site do jornal ABola")
    print("2 - Aplicar o algoritmo do TFIDF")
    print("3 - Sair")
    word = 0
    nword = 0
    narray = []
    j = 0
    for line in fileinput.input():
        if line.replace("\n", "") == "1":
            os.system('python3 web_scraper.py')
            print("Que deseja fazer?")
            print("1 - Consultar a informação do site do jornal ABola")
            print("2 - Aplicar o algoritmo do TFIDF")
            print("3 - Sair")
        elif (line.replace("\n", "") == "2") or (word > 0):
            if word == 0 and j == 0:
                if (os.path.isdir("artigos") == False):
                    print('Necessita de gerar primeiro o conteúdo. Escolha a opção 1')
                    print("Que deseja fazer?")
                    print("1 - Consultar a informação do site do jornal ABola")
                    print("2 - Aplicar o algoritmo do TFIDF")
                    print("3 - Sair")
                else:
                    filesA = os.listdir('artigos')
                    table = TfIdf()
                    for i in filesA:
                        with open('artigos/{}'.format(i), 'r') as content:
                            # print(content.read().split('h2'))
                            val = content.read().split('h2')
                            firstVal = val[0]
                            secondVal = val[1]
                            table.add_document(
                                'title{}'.format(i),
                                re.sub(r'[\W]', ' ', firstVal).lower().split())
                            table.add_document(
                                'text{}'.format(i),
                                re.sub(r'[\W]', ' ', secondVal).lower().split())
                    word += 1
                    print('Indique quantas palavras quer comparar:')
            elif (word == 1) and (j == 0):
                if (line.replace("\n", "").isnumeric() and int(line) > 1):
                    nword = int(line)
                    word += 1
                else:
                    print('Digite um número maior que 1')
            elif (word > 1) and (word <= nword) and (j == 0):
                if (line.replace("\n", "") != ''):
                    narray.append(line.replace("\n", "").lower())
                    word += 1
                else:
                    j = 1
            if (j == 1):
                if line.replace("\n", "") != '':
                    narray.append(line.replace("\n", "").lower())
                j += 1
            if (j == 2):
                print(narray)
                fTDIDF = open('output' + narray[0] + '.html', 'w+')
                fTDIDF.write('<h2>Resultados da aplicação do algoritmo:</h2>')
                splitArray = {}
                for s in table.similarities(narray):
                    if s[0].startswith('title'):
                        s[0] = s[0].replace('title', '')
                        if s[0] in splitArray.keys():
                            splitArray[s[0]] += s[1] * 0.7
                        else:
                            splitArray[s[0]] = s[1] * 0.7
                    elif s[0].startswith('text'):
                        s[0] = s[0].replace('text', '')
                        if s[0] in splitArray.keys():
                            splitArray[s[0]] += s[1] * 0.3
                        else:
                            splitArray[s[0]] = s[1] * 0.3
                for elem in splitArray.keys():
                    fTDIDF.write('<p><h5><a href="artigos/{}" >'.format(elem) + elem +
                                 '</a> -> ' + str(splitArray[elem]) + '</h5></p>')
                new = 2  # open in a new tab, if possible
                url = "file:///home/ze/SPLN/WebScraper/output" + narray[0] + ".html"
                webbrowser.open(url, new=new)
                word = 0
                nword = 0
                narray = []
                print("Que deseja fazer?")
                print("1 - Consultar a informação do site do jornal ABola")
                print("2 - Aplicar o algoritmo do TFIDF")
                print("3 - Sair")
        elif (line.replace("\n", "") == "3") and (word == 0):
            print("Obrigado pela sua visita")
            fileinput.close()
def idf_cache(self):
    """The training and test corpora are both included here."""
    result = TfIdf(self.documents_with_segments, self.cache_dir).idf_cache
    self.idf_file = result  # use our own IDF, for FeaturesWeight to consume
    return result
# ]
q = 'gold'
dictOf = {i: document[i] for i in range(0, len(document))}
print('+----------------------------------+')
print('documents : ')
pprint.pprint(dictOf)
print('')
print('query :' + q)
print('+----------------------------------+')
print('')
print('Pembobotan TF-IDF')
tfidf = TfIdf().transform(q=q, document=document)
print("Bobot rata-rata: " + str(tfidf.weight_average()))
pprint.pprint(tfidf.get_weight())
print("+---------------------------------+")
print('')
print('Pembobotan W-IDF')
widf = WIdf().transform(q=q, document=document)
print("Bobot rata-rata: " + str(widf.weight_average()))
pprint.pprint(widf.get_weight())
print("+---------------------------------+")
print('')
print('Pembobotan TFRF')
tfrf = TFRF().transform(q=q, document=document)
print("Bobot rata-rata: " + str(tfrf.weight_average()))
except IndexError:
    if (len(results) == 0):
        print("No results were found for this query.")
        exit()
    else:
        pass

for posting in list_of_postings:
    results = set(results).intersection(posting)

if (len(important) > 0):
    for res in results:
        important.append(res)

# vectorizer = TfidfVectorizer()
table = TfIdf()
G = nx.Graph()

# return urls corresponding to numbers
with open("url_files.csv") as f:
    urls = [row for row in csv.reader(f)]

if len(results) != 0:
    for x in results:
        f = open(urls[x - 1][0])
        obj = json.load(f)
        soup = BeautifulSoup(obj["content"], "html.parser",
                             from_encoding="iso-8859-1")
        joinedText = [
def extract_keywords(db, tokens):
    dfs = db.get_dfs()
    tfidf = TfIdf(dfs)
    return tfidf.new_keywords(tokens)
print("Getting data from " + url.strip() + "...", end="", flush=True) response = get(url=url) print("done!\nParsing HTML data...", end="", flush=True) parser.feed(response.text) print("done!") keydict = parser.get_keydict() urldata = {"url": url, "keywords": keydict} id_md5 = hashlib.md5(url.encode()).hexdigest() docs[id_md5] = urldata ti = TfIdf(docs) for kd, d in docs.items(): print("Processing document " + kd + "...", end="", flush=True) for kw, t in d['keywords'].items(): docs[kd]['keywords'][kw]['tf_idf'] = ti.tf_idf(kw, kd) print("done!") fname = 'webdirectory.txt' print("Saving to file " + fname + "...", end="", flush=True) with open(fname, 'w') as file: file.write(json.dumps(docs, sort_keys=False)) print("done!\nCompleted!")
def setUp(self):
    self.unk_cutoff = 2
    self.vocab = TfIdf(unk_cutoff=self.unk_cutoff)
'''
Created on 15 Jul 2018

@author: goksukara
'''
from tfidf import TfIdf


def addallfilesinpath(path):
    pass


if __name__ == "__main__":
    Tf_idf = TfIdf('s')
    list = [['human', 'human', 'interface'], ['ship', 'human', 'interface']]
    list1 = [['ship', 'humasn', 'interface']]
    list2 = [['human', 'human', 'am']]
    list3 = [['humafn', 'humasn', 'am1']]
    # map(unicode, list)
    Tf_idf.add_document(list)
    Tf_idf.add_document(list1)
    Tf_idf.Saverelatedwords()
    Tf_idf.add_document(list2)
    Tf_idf.add_document(list3)
    Tf_idf.SaveCorpusdic()
    Tf_idf.loaddictionary()
    Tf_idf.buildmodel()
    # Tf_idf.listnhighIdfs(10)
    Tf_idf.getTF_IDF()