def main(check):
    # Get data
    data = readJsonData('./dataset/politifact_results.json')
    dataset = []

    # Set settings
    checking = 'politifact'
    k = 13
    random_state = 1234
    width = 3

    # Split for Cross Validation
    # x_train, x_test = train_test_split(
    #     data, test_size=0.2, random_state=random_state)  # test = 20%, train = 80%

    for i in data.index:
        if i % 250 == 0:
            print(str(i))

        # Set query and targets
        query = ' '.join(preprocessing(data['original_article.content'][i]))
        query_url = data['original_article.url'][i]
        targets = []
        labels = []
        for v in data['extracted_articles'][i]:
            # Drop incomplete extracted articles entirely
            if check == 1:
                # Skip entries with missing content, title, or URL
                if not v or not v['content'] or not v['title'] or not v['url']:
                    continue

                # Preprocessing
                preprocessed_content = preprocessing(v['content'])
                string_preprocessed_content = ' '.join(preprocessed_content)

                targets.append(string_preprocessed_content)
                labels.append(v['url'])
            # Don't drop incomplete entries: keep empty placeholders instead
            else:
                if not v or not v['content'] or not v['title'] or not v['url']:
                    targets.append("")
                    labels.append("Empty" + str(i))
                    i += 1  # bump the index so repeated empty placeholders get distinct labels
                else:
                    # Preprocessing
                    preprocessed_content = preprocessing(v['content'])
                    string_preprocessed_content = ' '.join(preprocessed_content)

                    targets.append(string_preprocessed_content)
                    labels.append(v['url'])

        dataset.append(
            simhash_1(labels, targets, query, query_url, checking, k, width))
        # print('-' * 50)

    appendToDataset("./dataset/simhash_dataset.csv", dataset)
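# Hedged illustration (not part of the original file): simhash_1 above is defined
# elsewhere in this repo, so its internals are unknown here. The sketch below shows
# the standard SimHash construction (weighted bit voting over token hashes) plus the
# Hamming distance that a threshold such as k = 13 would typically be compared
# against. The helper names are hypothetical.
import hashlib

def simhash_sketch(tokens, bits=64):
    # Each token votes +1/-1 on every fingerprint bit according to its hash.
    votes = [0] * bits
    for token in tokens:
        h = int(hashlib.md5(token.encode('utf-8')).hexdigest(), 16)
        for b in range(bits):
            votes[b] += 1 if (h >> b) & 1 else -1
    fingerprint = 0
    for b in range(bits):
        if votes[b] > 0:
            fingerprint |= 1 << b
    return fingerprint

def hamming_distance(a, b):
    # Number of differing bits between two fingerprints.
    return bin(a ^ b).count('1')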
def main():
    # Get data
    data = readJsonData('./dataset/gossipcop_results.json')
    dataset = []

    # Split for Cross Validation
    # x_train, x_test = train_test_split(
    #     data, test_size=0.2, random_state=1234)  # test = 20%, train = 80%

    # Set settings
    min_jaccard_value = None
    n_gram = 3
    n_gram_type = 'term'
    n_permutations = 100
    no_of_bands = 50
    checking = 'gossipcop'

    for i in data.index:
        # Set query and targets
        preprocessed_query = preprocessing(data['original_article.content'][i])
        string_preprocessed_content = ' '.join(preprocessed_query)

        # Content must be longer than the n-gram length
        if n_gram_type == 'char' and len(string_preprocessed_content) < n_gram:
            continue
        if n_gram_type == 'term' and len(preprocessed_query) < n_gram:
            continue

        query = string_preprocessed_content
        targets = [query]
        labels = [data['original_article.url'][i]]
        for v in data['extracted_articles'][i]:
            # Skip entries with missing content, title, or URL
            if not v or not v['content'] or not v['title'] or not v['url']:
                continue

            # Preprocessing
            preprocessed_content = preprocessing(v['content'])
            string_preprocessed_content = ' '.join(preprocessed_content)

            # Content must be longer than the n-gram length
            if n_gram_type == 'char' and len(string_preprocessed_content) < n_gram:
                continue
            if n_gram_type == 'term' and len(preprocessed_content) < n_gram:
                continue

            targets.append(string_preprocessed_content)
            labels.append(v['url'])

        # find near-duplicate targets for the query article
        dataset.append(find_near_duplicate(checking, query, targets, labels, min_jaccard_value,
                                            no_of_bands, n_permutations, n_gram, n_gram_type))
        # print('-' * 50)

    appendToDataset("./dataset/lsh_dataset.csv", dataset)
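# Hedged illustration (not part of the original file): find_near_duplicate above is
# defined elsewhere, so this only sketches the quantity its MinHash/LSH settings
# (n_permutations, no_of_bands, n_gram, n_gram_type='term') approximate: Jaccard
# similarity over term n-gram shingles. Helper names are hypothetical.
def term_shingles(text, n=3):
    # Set of overlapping term n-grams for one preprocessed document.
    tokens = text.split()
    return {tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)}

def jaccard(a, b):
    # |A ∩ B| / |A ∪ B|; 0.0 when both shingle sets are empty.
    union = a | b
    return len(a & b) / len(union) if union else 0.0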
def test():
    text = """
    LTE single-card dual-standby multi-mode terminal and method for processing concurrency of its CS service and PS service
    The present invention is applicable to the field of communications technologies, and provides an method, the method includes:
    when a CS service and PS service of a local LTE single-card dual-standby multi-mode terminal are concurrent, detecting, by a
    local LTE single-card dual-standby multi-mode terminal, whether a peer communication terminal that is performing voice
    communication with it is in a voice silent period; when detecting that the peer communication terminal is not in the voice
    silent period, receiving, by the local LTE single-card dual-standby multi-mode terminal, downlink data in an LTE system, and
    suspending, by the local LTE single-card dual-standby multi-mode terminal, sending of uplink data in the LTE system at the
    same time; and when detecting that the peer communication terminal is in the voice silent period, sending the uplink data
    and receiving the downlink data, by the local LTE single-card dual-standby multi-mode terminal, in the LTE system.
    """

    bigram_measures = BigramAssocMeasures()
    # trigram_measures = TrigramAssocMeasures()

    # change this to read in your data
    finder = BigramCollocationFinder.from_words(preprocessing(text))

    # only keep bigrams that appear at least this often
    # finder.apply_freq_filter(2)

    # print the 50 bigrams with the highest likelihood-ratio score
    # print(finder.nbest(bigram_measures.pmi, 50))
    # print(finder.nbest(bigram_measures.likelihood_ratio, 20))
    # print(finder.nbest(bigram_measures.poisson_stirling, 20))
    for x, y in finder.nbest(bigram_measures.likelihood_ratio, 50):
        print(x + ' ' + y)
def createRandomForest(self, f, ds, train):
    # Build one decision tree per bootstrap replicate of the training matrix.
    bootstraps = bootstrap(matrix=train, n=self.ntree)
    trees = []
    accuracies = []
    for i, boots in enumerate(bootstraps):
        ds.dataMatrix = boots[0]
        x, y, attrList, possibleValuesList = preprocessing(f, ds)
        tree = self.generateTree(x, y, attrList, possibleValuesList)
        trees.append(tree)
    return trees
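# Hedged illustration (not part of the original file): the bootstrap helper used by
# createRandomForest is defined elsewhere in this repo. This sketch assumes each
# replicate is an (in-bag rows, out-of-bag rows) pair, since the code above only
# consumes boots[0]; the name and return shape are assumptions, and rows are assumed
# to be plain Python lists/tuples that support equality comparison.
import random

def bootstrap_sketch(matrix, n):
    replicates = []
    for _ in range(n):
        # Sample len(matrix) rows with replacement for the in-bag set.
        in_bag = [random.choice(matrix) for _ in range(len(matrix))]
        out_of_bag = [row for row in matrix if row not in in_bag]
        replicates.append((in_bag, out_of_bag))
    return replicates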
def TFIDF_1(docs, manuals, topN):
    _DOCS_NUM = len(docs)
    docs_words = []
    idfCount = {}

    print('PROCESS--IDF')
    for i, doc in enumerate(docs):
        sys.stdout.write("\r{0}/{1}".format(i + 1, _DOCS_NUM))
        sys.stdout.flush()
        words = preprocessing(doc.lower())
        docs_words.append(words)
        # document frequency: in how many documents each word appears
        for word in set(words):
            if word in idfCount:
                idfCount[word] += 1
            else:
                idfCount[word] = 1

    tp = 0
    predicted_num = 0
    candidates = {}
    print('\nPROCESS--TFIDF and topN predict')
    for i, doc_words in enumerate(docs_words):
        sys.stdout.write("\r{0}/{1}".format(i + 1, len(docs_words)))
        sys.stdout.flush()
        tfCount = {}
        tfidf = {}
        _WORDS_NUM = 0
        for word in doc_words:
            if word in tfCount:
                tfCount[word] += 1
            else:
                tfCount[word] = 1
            _WORDS_NUM += 1
        # per-document tf-idf, accumulated across all documents
        for word, freq in tfCount.items():
            idf = math.log10(_DOCS_NUM / idfCount[word])
            tfidf[word] = freq / _WORDS_NUM * idf
        for word, value in tfidf.items():
            if word in candidates:
                candidates[word] += value
            else:
                candidates[word] = value

    predicted = dictTopN(candidates, topN)
    predicted_num += len(predicted)
    tp += sum(1 for word in predicted if word in manuals)
    statics(tp, predicted_num, len(manuals))
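# Hedged illustration (not part of the original file): dictTopN and statics are
# helpers defined elsewhere in this repo. A minimal sketch of what dictTopN
# presumably does, i.e. pick the topN highest-scoring keys, with a hypothetical name:
def dict_top_n_sketch(scores, topN):
    # Sort by score, descending, and keep the first topN keys.
    return [word for word, _ in sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[:topN]]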
def test(file, examples):
    print "Testing bootstrap for: ", file
    f = files[file]
    x, y, attrList, possibleValuesList = preprocessing(f)
    print possibleValuesList
    dt = DecisionTree(x, y, attrList, possibleValuesList,
                      int(len(x[0])**0.5))  # **1 for test dataset, **0.5 for the other ones
    dt.training()
    print "\nDecision Tree:\n"
    dt.printTree()
    for example in examples:
        print "\n", dt.predict(example)
    print "\n---------------------------------------------------\n"
def TFIDF(docs):
    _DOCS_NUM = len(docs)
    docs_bis = []
    # total_bis = []
    result = {}
    # bigram_measures = BigramAssocMeasures()
    bisFreDist = {}
    _BI_NUM = 0
    for doc in docs:
        # finder = BigramCollocationFinder.from_words(preprocessing(doc))
        # bi = finder.nbest(bigram_measures.likelihood_ratio, 2000)
        bi = list(nltk.bigrams(preprocessing(doc)))
        docs_bis.append(bi)
        _BI_NUM += len(bi)
        # total_bis += bi
        for word in bi:
            if word in bisFreDist:
                bisFreDist[word] += 1
            else:
                bisFreDist[word] = 1
    # bisFreDist = nltk.FreqDist(total_bis)
    # total_bis = None
    for word, freq in bisFreDist.items():
        try:
            count = sum(1 for doc_bis in docs_bis if word in doc_bis)
            idf = math.log10(_DOCS_NUM / count) + 0.01
            # print(idf, bi_TFdist.freq(word))
            (x, y) = word
            # result[x.lower() + ' ' + y.lower()] = bisFreDist.freq(word) * idf
            result[x.lower() + ' ' + y.lower()] = freq / _BI_NUM * idf
        except AttributeError:
            pass
    bisFreDist = None
    return result
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# These two lines are necessary to find source files!!!
import sys
sys.path.append('../src')

from files import files
from main import DataSet, preprocessing, DecisionTree

if __name__ == '__main__':
    f = files["test"]
    x, y, attrList, possibleValuesList = preprocessing(f)
    dt = DecisionTree(x, y, attrList, possibleValuesList,
                      int(len(x[0])**1))  # **1 for test dataset, **0.5 for the other ones
    dt.training()
    print "\nDecision Tree:\n"
    dt.printTree()
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
from main import preprocessing

text = """
A computer system provides a plug-in architecture for creation of a dynamic font. The computer system can incorporate
a new filter function into a filtering layer of a font program. The filtering layer includes pre-defined filter
functions to transform a base font into a new font. The computer system applies one or more font rules in the
filtering layer to the base font. The font rules are implemented by the new filter function and at least one of the
pre-defined filter functions to randomize an appearance of each character in a character string. The character string
rendered with the new font has a dynamic and randomized appearance.
"""

bigram_measures = BigramAssocMeasures()
# trigram_measures = TrigramAssocMeasures()

# change this to read in your data
finder = BigramCollocationFinder.from_words(preprocessing(text))

# only keep bigrams that appear at least this often
# finder.apply_freq_filter(2)

# rank the bigrams by each association measure
print(finder.nbest(bigram_measures.pmi, -1))
print(finder.nbest(bigram_measures.likelihood_ratio, -1))
print(finder.nbest(bigram_measures.poisson_stirling, -1))

# scratch note: sorting month-year strings chronologically
# d = ['09-2012', '04-2007', '11-2012', '05-2013', '12-2006', '05-2006', '08-2007']
# sort_index = sorted(d, key=lambda x: datetime.datetime.strptime(x, '%m-%Y'))
def proposed(docs, manuals, topN, alpa):
    print('=== PROPOSED ====')
    _DOCS_NUM = len(docs)
    docs_words = []
    _WORDS_NUM = 0
    idfCount = {}
    tfCount = {}
    occurCount = {}

    """ Log-likelihood ratio bigrams """
    print('PROCESS--Log likelihood ratio bigrams')
    tmp = []
    for doc in docs:
        tmp += preprocessing(doc.lower())
    finder = BigramCollocationFinder.from_words(tmp)
    # _size = math.floor(len(finder.score_ngrams(BigramAssocMeasures().likelihood_ratio))/20)+1
    bigramSet = finder.nbest(BigramAssocMeasures().likelihood_ratio, 200)

    # finder = TrigramCollocationFinder.from_words(tmp)
    # _size = math.floor(len(finder.score_ngrams(TrigramAssocMeasures().likelihood_ratio))/40)+1
    # trigramSet = finder.nbest(TrigramAssocMeasures().likelihood_ratio, _size)
    # tmp = None

    # nltk.RegexpParser('{(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}')

    for i, doc in enumerate(docs):
        sys.stdout.write("\r{0}/{1}".format(i + 1, len(docs)))
        sys.stdout.flush()
        words = preprocessing(doc.lower())
        dealwith = set()
        for x, y in bigramSet:
            if x in words and y in words:
                if x not in dealwith:
                    words.remove(x)
                    dealwith.add(x)
                if y not in dealwith:
                    words.remove(y)
                    dealwith.add(y)
                words.append(x + ' ' + y)
            else:
                pass
        # for x, y, z in trigramSet:
        #     if x in words and y in words and z in words:
        #         if x not in dealwith:
        #             words.remove(x)
        #             dealwith.add(x)
        #         if y not in dealwith:
        #             words.remove(y)
        #             dealwith.add(y)
        #         if z not in dealwith:
        #             words.remove(z)
        #             dealwith.add(z)
        #         words.append(x + ' ' + y + ' ' + z)
        #     else:
        #         pass
        docs_words.append(words)
        _WORDS_NUM += len(words)

        """ count idf """
        for word in set(words):
            if word in idfCount:
                idfCount[word] += 1
            else:
                idfCount[word] = 1

        """ count occur and tf """
        for j, word1 in enumerate(words):
            if word1 in tfCount:
                tfCount[word1] += 1
            else:
                tfCount[word1] = 1
            for word2 in words[j + 1:]:
                if (word1, word2) in occurCount:
                    occurCount[(word1, word2)] += 1
                elif (word2, word1) in occurCount:
                    occurCount[(word2, word1)] += 1
                else:
                    occurCount[(word1, word2)] = 1

    # """ Compute PMI """
    # for (word1, word2) in occurCount:
    #     val = round(math.log10(occurCount[(word1, word2)]*_WORDS_NUM/tfCount[word1]/tfCount[word2]), 8)
    #     occurCount[(word1, word2)] = val
    #
    # """ Construct graph """
    # g = nx.Graph()
    # for (word1, word2), value in occurCount.items():
    #     g.add_edge(word1, word2, weight=value)
    # print('Graph node number: %s' % (g.number_of_nodes()))
    # print('Graph edge number: %s' % (g.number_of_edges()))
    # occurCount = None
    #
    # rwrScore = RWR(g, None, 0.03, 1000, 0.000003)
    # _min = min(rwrScore.values())
    # _max = max(rwrScore.values())
    # print(_min, _max)
    # for key, value in rwrScore.items():
    #     rwrScore[key] = (value - _min) / (_max - _min)

    tp = 0
    predicted_num = 0
    candidates = {}
    for i, doc_words in enumerate(docs_words):
        sys.stdout.write("\r{0}/{1}".format(i + 1, len(docs_words)))
        sys.stdout.flush()
        thisTfCount = {}
        newScore = {}
        _NUM = len(doc_words)
        for word in doc_words:
            if word in thisTfCount:
                thisTfCount[word] += 1
            else:
                thisTfCount[word] = 1

        """ TF-IDF """
        for word, tf in thisTfCount.items():
            newScore[word] = tf / _NUM * math.log10(_DOCS_NUM / idfCount[word])

        _min = min(newScore.values())
        _max = max(newScore.values())
        for word, tfidf in newScore.items():
            # val = alpa*(tfidf-_min)/(_max-_min) + (1-alpa)*rwrScore[word]
            # val = tfidf + rwrScore[word]
            if word in candidates:
                candidates[word] += tfidf
            else:
                candidates[word] = tfidf

    predicted = dictTopN(candidates, topN)
    predicted_num += len(predicted)
    tp += sum(1 for word in predicted if word in manuals)
    statics(tp, predicted_num, len(manuals))
import nltk
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import gensim.downloader as api  # assumed import: `api` is used below but was not imported in the original cells

import main

train_data = main.create_dataframe('train.raw')
test_data = main.create_dataframe('test.raw')

train_processed = train_data.apply(
    lambda x: main.preprocessing(x, train_data)
    if x.name in ['sentence', 'words_left', 'words_right'] else x)
test_processed = test_data.apply(
    lambda x: main.preprocessing(x, test_data)
    if x.name in ['sentence', 'words_left', 'words_right'] else x)

word2vec = api.load("glove-twitter-100")