def predict(n, url):
    # Download and parse the article (newspaper3k-style Article API), then
    # score the preprocessed body text against the title with tf_idf.
    article = Article(url, language="en")
    article.download()
    article.parse()
    article.nlp()
    content = pre_process_article(str(article.text))
    title = pre_process_article(str(article.title))
    tf_idf(content, title, n)
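# A minimal usage sketch for predict() above, not part of the original module:
# the URL and the value of n are hypothetical placeholders, and it assumes the
# newspaper3k package (which provides Article) plus the local
# pre_process_article and tf_idf helpers are importable in this scope.
if __name__ == "__main__":
    sample_url = "https://example.com/some-news-article"  # hypothetical URL
    predict(5, sample_url)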
def runEngine(tokenizer_type, address):
    print("RUNNING ENGINE ...")
    inverted_index = load_obj("INVERTED INDEX " + tokenizer_type + "_" + address)
    tfidfs = load_obj("CHAMPIONS " + tokenizer_type + "_" + address)
    query = ""
    while True:
        query = input("QUERY :>")
        if query == "!q":
            break
        queryToken = query.split(" ")
        queryw = tf_idf(queryToken, inverted_index, False)
        h = Heap()
        for i in range(len(tfidfs)):
            # Only score documents that share at least one term with the query.
            if bool(set(queryToken) & set(tfidfs[i].keys())):
                sim = querySimilarity(queryw, tfidfs[i])
                if sim != 0:
                    h.addnSort([i, sim])
        k = 10
        result = h.getFirstK(k)
        k = min(len(result), k)  # guard against fewer than k matches
        titles = fetch_column(address, 'title')
        for i in range(k):
            print(titles[result[i][0]][::-1])
def getChampions(tokens, inverted_index):
    tfidfs = []
    for i in range(len(tokens)):
        tfidfs.append(tf_idf(tokens[i], inverted_index, False))
    champions_term = {}
    champions_list = {}
    # TODO: Optimize this mess
    # inverted_index[term] is assumed to be [doc_frequency, doc_id_1, doc_id_2, ...].
    for term in inverted_index:
        champions_term[term] = [None] * inverted_index[term][0]
        for i in range(0, inverted_index[term][0]):
            champions_term[term][i] = tfidfs[inverted_index[term][i + 1]][term]
    for term in champions_term:
        # Keep the positions of the 10 highest-scoring postings for each term.
        y = list(
            zip(*heapq.nlargest(10,
                                enumerate(champions_term[term]),
                                key=operator.itemgetter(1))))[0]
        champions_list[term] = list(y)
    for term in champions_list:
        l = min(10, len(champions_list[term]))
        for i in range(l):
            # Map posting positions back to document ids.
            champions_list[term][i] = inverted_index[term][champions_list[term][i] + 1]
    return champions_list
def make_tf_matrix(self, doc_list):
    tf_matrix = []
    for doc in doc_list:
        row = [0] * self.count
        for word in doc.split(' '):
            row[self.index[word]] = tfidf.tf_idf(word, doc, doc_list)
        tf_matrix.append(row)
    return tf_matrix
def generate_classifier_model(tokenized_sentences, features):
    model = []
    import tfidf
    tfidf.cache_enabled(True)
    for sentence in tokenized_sentences:
        tweet_model = {}
        for i, feature in enumerate(features):
            tfidfv = tfidf.tf_idf(feature, sentence, tokenized_sentences)
            if tfidfv > 0:
                # Store only non-zero features as a sparse {index: score} dict.
                tweet_model[i] = tfidfv
        model.append(tweet_model)
    return model
def compute_document_score_by_term(term, tags=None):
    # documents = find_documents(term)
    scores = tfidf.tf_idf(term, tags)
    # start_time = time.time()
    # res = tfidf.terms_collection.aggregate([
    #     {'$match': {'term': term}},
    #     {
    #         '$group': {
    #             '_id': "$doc",
    #             'count': {'$sum': 1}
    #         }
    #     }
    # ])
    # for doc in res:
    #     scores[doc['_id']] = tfidf.tf_idf(term, doc['_id'], doc['count'])
    # print "Find Score Time = ", str((time.time() - start_time))
    # for doc in documents:
    #     scores[doc] = tfidf.tf_idf(term, doc)
    return scores
def callback(self):
    print("GOT TOKEN ", self.text.get("1.0", END))
    queryToken = self.text.get("1.0", END).replace("\n", "").split(" ")
    queryToken = normalize_query(queryToken)
    print(queryToken)
    queryw = tf_idf(queryToken, self.inverted_index, False)
    h = Heap()
    for i in range(len(self.tfidfs)):
        if bool(set(queryToken) & set(self.tfidfs[i].keys())):
            sim = querySimilarity(queryw, self.tfidfs[i])
            if sim != 0:
                h.addnSort([i, sim])
    k = 10
    result = h.getFirstK(k)
    k = min(len(result), k)
    print(k)
    for i in range(k):
        print(self.titles[result[i][0]])
        self.mylist.delete(i)
        self.mylist.insert(i, self.titles[result[i][0]])
def get_bag_of_words_labels(preprocessed_records, args):
    """Gets the labels for the bag of words. A label can be a single important word, a collocation
    of two important words, or a set of synonyms of a word.

    Params:
    - preprocessed_records (pyspark.rdd.RDD): The tokenized, lemmatized, lowercase records
    - args (argparse.Namespace): The command-line arguments passed to the program

    Returns:
    - bag_of_words_labels (list<str|tuple<str>>): The labels of the bag of words created
    """
    reformatted_records = preprocessed_records.map(
        lambda record: (record['id'], record['preprocessed_record']))
    frequent_collocations = wordcount.extract_collocations(
        reformatted_records, args.num_collocations, args.collocation_window)
    tf_idf_scores = tfidf.tf_idf(reformatted_records)
    # Pyspark technically ends here - the rest is processed on the master node
    important_words = tfidf.extract_important_words(tf_idf_scores, args.num_words, False)
    # important_words_with_counts = synsets.add_word_counts(important_words, frequent_words)
    synset_dict = synsets.generate_syn_set(important_words)
    words_and_collocations = wordcount.merge_collocations_with_wordlist(
        frequent_collocations, important_words)
    # Merge words, collocations and synsets
    bag_of_words_labels = list()
    for item in words_and_collocations:
        if " " in item:  # item is a collocation
            bag_of_words_labels.append(item)
        elif item in synset_dict:  # item is an important word
            synset = synset_dict[item]
            if len(synset) == 1:  # synset only contains the word itself
                bag_of_words_labels.append(item)
            else:  # synset contains multiple words
                synset = [word.encode('utf-8') for word in synset[1:]]
                bag_of_words_labels.append(synset)
    # Save bag of words labels to a single text file
    with open("bag_of_words_labels.json", "w") as bow_file:
        json.dump(bag_of_words_labels, bow_file)
    return bag_of_words_labels
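# A toy, self-contained sketch of the label-merging rule used above, with
# made-up data standing in for the wordcount/synsets outputs (all values here
# are hypothetical, not taken from the real pipeline).
def _merge_labels_demo():
    words_and_collocations = ["climate change", "flood", "storm"]
    synset_dict = {"flood": ["flood"], "storm": ["storm", "tempest", "squall"]}
    labels = []
    for item in words_and_collocations:
        if " " in item:  # collocation: keep the phrase as a single label
            labels.append(item)
        elif item in synset_dict:
            synset = synset_dict[item]
            if len(synset) == 1:  # the synset only contains the word itself
                labels.append(item)
            else:  # keep the synonyms, dropping the head word
                labels.append(synset[1:])
    return labels  # ["climate change", "flood", ["tempest", "squall"]]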
def main_(set_, v=None):
    data_pool = os.listdir(PATH + YEAR)  # prepare the full sample
    s = sorted(list(set_))               # sort the codes of the selected subset
    res = []
    if v is None:
        lst = []
        # Take the first 4 characters and convert them to int so they can be searched.
        for i in range(len(data_pool)):
            data_pool[i] = int(data_pool[i][0:4])
        for key in s:
            res = binary_search(data_pool, key)  # binary search
            lst.append(res)                      # returned index (locates the CSV of a specific industry)
        for w in lst:
            write_txt(data_pool[w], YEAR)
        # Compare the files two at a time.
        for idx in lst:
            for idx_ in lst:
                print(str(data_pool[idx]) + "<--->" + str(data_pool[idx_]))
                s1 = get_txt(data_pool[idx])
                s2 = get_txt(data_pool[idx_])
                j1 = jieba_(s1)
                j2 = jieba_(s2)
                vector = tf_idf(j1, j2)
                cos = get_cos(vector[0], vector[1])
                with open("化學" + YEAR + ".txt", "a+") as f:
                    f.write(str(data_pool[idx]) + "<--->" + str(data_pool[idx_]) + " : " + str(cos) + "\n")
import re
import tfidf

# Regex extraction test (kept for reference):
# pattern = '[0-9]+[\t]+(.+)[\t]+[0-9]+'
# p = re.compile(pattern)
# m = p.search("\"1 theo walcott is still shit, watch rafa and johnny deal with him on saturday. 1\"")
# content = m.group(1)
# print(content)

# tf-idf test
# Documents: "i have have a dog" and "i like"
# tf (term count divided by the most frequent term's count in each doc):
#   doc 1 -> i: 1/2, have: 2/2, a: 1/2, dog: 1/2
#   doc 2 -> i: 1,   like: 1
# tf-idf:
#   doc 1 -> i: 0, have: 1, a: 1/2, dog: 1/2
#   doc 2 -> i: 0, like: 1
res = tfidf.tf_idf([['i', 'have', 'have', 'a', 'dog'], ['i', 'like']])
tfidf.dump_tfidf_json(res, 'res.json')
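# A hand-rolled sketch of the tf-idf convention that the expected values above
# imply (tf = term count / most frequent term's count in the doc,
# idf = log2(N / df)). This is an assumption for illustration only; the actual
# tfidf module may use a different weighting.
import math
from collections import Counter


def tf_idf_sketch(docs):
    n_docs = len(docs)
    df = Counter(tok for doc in docs for tok in set(doc))  # document frequencies
    scores = []
    for doc in docs:
        counts = Counter(doc)
        max_count = max(counts.values())
        scores.append({tok: (cnt / max_count) * math.log2(n_docs / df[tok])
                       for tok, cnt in counts.items()})
    return scores


# tf_idf_sketch([['i', 'have', 'have', 'a', 'dog'], ['i', 'like']])
# -> [{'i': 0.0, 'have': 1.0, 'a': 0.5, 'dog': 0.5}, {'i': 0.0, 'like': 1.0}]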
raw_data, raw_labels = load_data_from_file()
print(f'[main] Raw data dims - tweets: {len(raw_data)} / labels: {len(raw_labels)}')

# Pre-process tweets & labels:
corpus, labels = preprocess(raw_data, raw_labels, stemmer_flag)
store_processed_data(corpus, labels)  # save processed data to file
print(f'[main] Processed data dims - corpus: {len(corpus)} / labels: {len(labels)}')

# Compute TF-IDF or Graph of Words
if method == 'TFIDF':
    X = tf_idf(corpus)
elif method == 'GOW':
    X = gow(corpus)
else:
    raise ValueError(f'[main] Invalid method: {method}')

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X,
    labels,
    test_size=test_size,
    stratify=labels,
    random_state=RANDOM_STATE,
)
print(f'[main] Training/test set dimensions: {len(y_train)}/{len(y_test)}\n')
algo = 'MLP'
method = 'TFIDF'
test_size = .2

estimator = get_classifier(algo)
X, y = load_processed_data()

# Create multi-label-like settings
handles = get_handles()
y = label_binarize(y, classes=[handles[0], handles[1], handles[2]])
n_classes = y.shape[1]

# Compute TF-IDF or Graph of Words
if method == 'TFIDF':
    X = tf_idf(X)
elif method == 'GOW':
    X = gow(X)
else:
    raise ValueError(f'[main] Invalid method: {method}')

# Split into training and test
X_train, X_test, Y_train, Y_test = train_test_split(
    X, y, test_size=test_size, random_state=RANDOM_STATE)
X_train, X_test = normalize_X(X_train, X_test)

# Load the best model from GridSearchCV()
model = load_model(algo, method, 'tuning_random')
params = model.best_params_
print(f'Best model parameters: \n{params}\n')
# params.update({'n_jobs': n_jobs},)
import tool
import tfidf

filename = 'tweet.txt'
content_list = tool.read_tweet_content(filename)
print(content_list[0])
print('- - ' * 50)
print(content_list[1])

stem_list = tfidf.preprocess(content_list)
print('- - ' * 50)
print(stem_list[0])
print('- - ' * 50)
print(stem_list[1])

token_2d_list = tfidf.tokenization(stem_list)
print(token_2d_list[0])
print(token_2d_list[1])

print('res')
res = tfidf.tf_idf(token_2d_list)
print('dump')
tfidf.dump_tfidf_json(res, 'tfidf.json')
from tfidf import tf_idf
from similarity import cosine_similarity
import os

copyrighted_works = '/Users/sm.marouzigmail.com/Documents/job/wattpad/library/'
user_text_file = '/Users/sm.marouzigmail.com/Documents/job/wattpad/test/test.txt'

# Read the content of the newly uploaded text.
try:
    with open(user_text_file, "r", encoding='iso-8859-1') as ifile:
        raw_text = ifile.read()  # raw text of the newly uploaded file
except:
    with open(user_text_file, "r", encoding='utf-8') as ifile:
        raw_text = ifile.read()  # raw text of the newly uploaded file

tfs_dict1 = tf_idf(raw_text)

library_path = os.listdir(copyrighted_works)  # list of text files
files = [file for file in library_path if file.endswith('.txt')]
for file in files:
    # Read the text data for each copyrighted work, extract TF-IDF terms and
    # values, and calculate the similarity between that vector and the
    # uploaded file's vector.
    full_path_file = os.path.join(copyrighted_works, file)
    # Read the raw texts one by one from the copyright library path.
    try:
parser = argparse.ArgumentParser(description='Generate Inverted Index')
parser.add_argument('address', help='csv address', action='store')
parser.add_argument('tokenize', help='type of tokenization: simple or pro', default='simple')
args = parser.parse_args()
csv_address = args.address
tokenize_type = args.tokenize

print("READING ", csv_address)
contents = fetch_column(csv_address, 'content')
tokens = tokenize(tokenize_type, contents)

print("TOKENIZING ", tokenize_type)
if tokenize_type == 'pro':
    tokens = normilize(tokens)
    tokens = stem_list(tokens)

print("BUILDING DICTIONARY AND INVERTED INDEX")
dictionary = build_dictionary(tokens)
inverted_index = build_inverted_index(tokens, dictionary)

print("WRITING INDEX AND SCORES")
writeObj("INVERTED INDEX", inverted_index, tokenize_type, csv_address)
tfidfs = []
for i in range(len(tokens)):
    tfidfs.append(tf_idf(tokens[i], inverted_index, True))
writeObj("TFIDF", tfidfs, tokenize_type, csv_address)
def indices(n):
    for i, j in product(range(n), range(n)):
        if j < i:
            yield i, j


if __name__ == '__main__':
    limit = 1000
    tfidf_thresh = 0.02

    vocab = open('data/vocab.txt', 'r').read().split('\n')
    embeddings = np.load('data/embeddings.npy', allow_pickle=False)
    tok2id = {tok: i for i, tok in enumerate(vocab)}

    tf, idf = tf_idf(stream_tokens(limit))  # Compute TF-IDFs

    # and create doc representations
    print('Creating doc matrices...')
    pmid_idx = {}
    mats = []
    for pmid, toks in tqdm(stream_tokens(limit)):
        ems = []
        for tok in set(toks):
            tfidf = tf[pmid][tok] * idf[tok]
            if tfidf >= tfidf_thresh:
                tid = tok2id.get(tok)
                if tid is None:
                    continue
                em = embeddings[tid]
def make_data():
    f = open('cl.sents.txt', 'r', encoding='utf-8')
    texts = f.readlines()
    f.close()
    docs = {}
    i = 0
    for el in texts:
        docs[i] = el
        i = i + 1
    return docs


docs = make_data()
index = tf_idf(docs)


def make_data2():
    f = open('sents.txt', 'r', encoding='utf-8')
    texts = f.readlines()
    f.close()
    docs2 = {}
    i = 0
    for el in texts:
        docs2[i] = el
        i = i + 1
    return docs2


docs2 = make_data2()
    :return: A dictionary mapping words in wordlist to their word counts
    """
    wordlist_with_counts = dict()
    for word in wordlist:
        if word in word_counts:
            wordlist_with_counts[word] = word_counts[word]
        else:
            print(word, " not in word_counts dictionary but in given wordlist")
    return wordlist_with_counts
# End of add_word_counts()


if __name__ == "__main__":
    args = wordcount.parse_arguments()
    records = wordcount.load_records(args.file, False)
    records = wordcount.preprocess_records(records)

    frequent_words = wordcount.extract_frequent_words(records, args.num_words * 10, False)
    frequent_words = dict(frequent_words)
    tf_idf_scores = tfidf.tf_idf(records)

    # Pyspark technically ends here - the rest is processed on the master node
    important_words = tfidf.extract_important_words(tf_idf_scores, args.num_words, True)
    important_words_with_counts = add_word_counts(important_words, frequent_words)
    synset_dict = generate_syn_set(important_words_with_counts.items())
    print_syn_set(synset_dict)