def precision_max_multiple_references(system, references, n):
    '''
    given: word ww from system output, reference sentences r1 r2 r3 r4;
    ww occurs o1 times in r1, o2 in r2, o3 in r3, o4 in r4.
    Take the max of o1 o2 o3 o4, sum over all ww,
    and divide by the number of words in the system output.
    '''
    max_word_count = Counter()
    ng_sys = ngrams(system, n)
    for ww in ng_sys:
        counts = Counter()
        for rr in references:
            ng_refs = ngrams(rr.split(), n)
            if ww in ng_refs:
                counts[rr] = ng_refs[ww]
            else:
                counts[rr] = 0
        max_word_count[ww] = max(counts.values())
    return float(sum(max_word_count.values())) / float(len(system))
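# --- Illustrative usage sketch (not from the original code) ----------------
# The helpers below mimic the max-over-references counting used by
# precision_max_multiple_references above on a toy example. The names
# _toy_ngrams and _toy_max_ref_precision are hypothetical; the real snippets
# rely on an external ngrams() helper that returns Counter-like objects.
from collections import Counter

def _toy_ngrams(tokens, n):
    return Counter(tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1))

def _toy_max_ref_precision(system_tokens, references, n=1):
    total = 0
    for gram in _toy_ngrams(system_tokens, n):
        total += max(_toy_ngrams(ref.split(), n)[gram] for ref in references)
    return float(total) / len(system_tokens)

# "the" occurs at most twice in any single reference, so the numerator is 2
print(_toy_max_ref_precision("the the the".split(),
                             ["the cat is on the mat", "there is a cat"], 1))  # 0.666...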
def run_svm(mode='r', iterations=2):
    # setup work (generate all the ngrams if they don't exist yet)
    # note: n, m and p are expected to be module-level globals in this script
    import os
    if not os.path.isfile('pos_%sgram.dump' % n):
        gen_all_ngrams()
    ind = Indexes(mode, iterations)
    acc = (0, 0, 0)
    # run svm
    for i in range(iterations):
        ind.next()
        (train, gramsdict) = training_set(ind, n=n)
        classifier = LinearSVMClassifier(
            data.Data(numpy.array(train, dtype=numpy.uint16).T))
        j = ind.get_pos_test_ind()[0]
        pos = os.listdir("pos")
        test = ngrams.grams_to_featurevector(
            gramsdict, ngrams.ngrams(n, open("pos/" + pos[j]).read()), label=None)
        print classifier.classify(test, dtype=numpy.uint16)
        neg = os.listdir("neg")
        j = ind.get_neg_test_ind()[0]
        test = ngrams.grams_to_featurevector(
            gramsdict, ngrams.ngrams(n, open("neg/" + neg[j]).read()), label=None)
        print classifier.classify(test, dtype=numpy.uint16)
        print m[-2]
        print m[-1]
        # p = test_model(m,ind,n=n)
        # nresults = len(p)
        # acc = [(a+b) for (a,b) in zip(acc,get_accuracy(p))]
        print acc
    return (m, p)
def training_model(ind, n=3):
    print "Loading features"
    load_features(n, fmap)
    print "Feature map size: %s" % fmap.getSize()
    print "Getting training data"
    train = []
    for i in ind.get_pos_train_ind():
        item = os.listdir("pos")[i]
        train.append((1, [(fmap.getID(gram), count)
                          for (gram, count) in ngrams.ngrams(n, open("pos/" + item).read()).items()
                          if fmap.hasFeature(gram)]))
    for i in ind.get_neg_train_ind():
        item = os.listdir("neg")[i]
        train.append((-1, [(fmap.getID(gram), count)
                           for (gram, count) in ngrams.ngrams(n, open("neg/" + item).read()).items()
                           if fmap.hasFeature(gram)]))
    print "Training model"
    model = svmlight.learn(train, type='classification', verbosity=0)
    svmlight.write_model(model, 'my_model.dat')
    return model
def keywords(passage):
    # List words and make all nouns singular
    word = []
    words = re.findall(r'\w+', passage)
    ini_tot_words = len(words)
    for w in words:
        if w == '000':
            w = 'THOUSAND'  # Future work: generalize!
        if w != '' and len(w) >= 2:  # words shorter than two characters are excluded
            if inflect.singular_noun(w) is False:
                word.append(w)
                continue
            else:
                s = inflect.singular_noun(w)
                word.append(s)
    tot_words = len(word)
    # Count words and select the n-most repeated ones
    word_counts = Counter(word)
    key_word_1 = word_counts.most_common(20)  # the n-most common single key-words
    all_2key_words = Counter(ngrams(word, 2))
    key_words_2 = all_2key_words.most_common(20)  # the n-most common bigrams (double key-words)
    all_3key_words = Counter(ngrams(word, 3))
    key_words_3 = all_3key_words.most_common(20)  # the n-most common trigrams (triple key-words)
    return (ini_tot_words, tot_words, key_word_1, key_words_2, key_words_3)
def test_model(model, ind, n=3):
    test = []
    for i in ind.get_pos_test_ind():
        item = os.listdir("pos")[i]
        test.append((1, [(fmap.getID(gram), count)
                         for (gram, count) in ngrams.ngrams(n, open("pos/" + item).read()).items()
                         if fmap.hasFeature(gram)]))
    for i in ind.get_neg_test_ind():
        item = os.listdir("neg")[i]
        test.append((-1, [(fmap.getID(gram), count)
                          for (gram, count) in ngrams.ngrams(n, open("neg/" + item).read()).items()
                          if fmap.hasFeature(gram)]))
    predictions = svmlight.classify(model, test)
    return predictions
def test(self):
    if self.test_set:
        for s in range(1, 6):
            self.test_dir = select_extradata(self.test_set, s)
            print "Testing with %s" % self.test_dir
            test_files = os.listdir(self.test_dir)
            ntest = len(test_files)
            tests = [{} for i in range(ntest)]
            for i in range(ntest):
                for j in self.n:
                    tests[i].update(ngrams.ngrams(
                        j, open("%s/%s" % (self.test_dir, test_files[i])).read(),
                        self.negation))
            results = [self.classifier.classify(i, binary=self.binary)
                       for i in tests]
            correct = len([i for i in results if int(i) == 1])
            print "%s Stars, Positive: %s of %s, %s accuracy" % (
                s, correct, len(tests), float(correct) / len(tests))
        return (0, 0)  # return dummy values when testing on external data

    pos_tests = [{} for f in self.pos_test_data]
    neg_tests = [{} for f in self.neg_test_data]
    # Test set --> feature vectors
    for j in self.n:
        for i in range(len(self.pos_test_data)):
            pos_tests[i].update(
                ngrams.ngrams(j, self.pos_test_data[i], self.negation))
        for i in range(len(self.neg_test_data)):
            neg_tests[i].update(
                ngrams.ngrams(j, self.neg_test_data[i], self.negation))
    # Testing
    pos_results = [self.classifier.classify(i, binary=self.binary)
                   for i in pos_tests]
    pos_correct = len([i for i in pos_results if int(i) == 1])
    print "Positive: %s of %s, %s accuracy" % (
        pos_correct, len(pos_tests), float(pos_correct) / len(pos_tests))
    neg_results = [self.classifier.classify(i, binary=self.binary)
                   for i in neg_tests]
    neg_correct = len([i for i in neg_results if int(i) == -1])
    print "Negative: %s of %s, %s accuracy" % (
        neg_correct, len(neg_tests), float(neg_correct) / len(neg_tests))
    return (float(pos_correct) / len(pos_tests),
            float(neg_correct) / len(neg_tests))
def precision(system, reference, n):
    counts = Counter()
    p = 0.0
    ng_sys = ngrams(system, n)
    ng_refs = ngrams(reference, n)
    for ss in ng_sys:
        if ss in ng_refs:
            counts[ss] += ng_refs[ss]
    for ng in counts:
        p += float(counts[ng]) / float(len(system))
    return p
def training_set(ind, n=3):
    """ Caution: Do not use 0 as a label because it evaluates to False """
    pos = os.listdir("pos")
    feature_vectors = [ngrams.ngrams(n, open("pos/" + pos[i]).read())
                       for i in ind.get_pos_train_ind()]
    labels = [1 for i in ind.get_pos_train_ind()]
    neg = os.listdir("neg")
    feature_vectors.extend([ngrams.ngrams(n, open("neg/" + neg[i]).read())
                            for i in ind.get_neg_train_ind()])
    labels.extend([2 for i in ind.get_neg_train_ind()])
    (matrix, gramsdict) = ngrams.ngrams_to_matrix(feature_vectors, labels,
                                                  return_gramsdict=True)
    return (matrix.asMatrix(), gramsdict)
def precision_multiple_references(system, references, n):
    counts = Counter()
    p = 0.0
    ng_sys = ngrams(system, n)
    for ss in ng_sys:
        for rr in references:
            ng_refs = ngrams(rr.split(), n)
            if ss in ng_refs:
                counts[ss] += ng_refs[ss]
    print counts
    for ng in counts:
        p += float(counts[ng]) / float(len(system))
    return p
def perplexity(self, corpus):
    """
    Compute per-token perplexity.

    The cross-entropy of p with respect to q is defined as:

        H(p, q) = -\sum_x p(x) \log_2 q(x)

    In the case of language modeling, q is the estimated probability
    distribution and p is the (MLE) probability of each n-gram in the
    held-out data. The perplexity of p w.r.t. q is then simply

        PPX(p, q) = 2^{H / Z}

    where Z is the number of tokens in p.
    """
    # frequency of observations over the corpus
    fx = defaultdict(int)
    # number of observations in the corpus
    Z = 0
    for (prefix, suffix) in ngrams(corpus, self.order):
        Z += 1
        fx[(prefix, suffix)] += 1
    # collect entropy
    H = BitWeight(1.)
    for ((prefix, suffix), f_x) in fx.iteritems():
        H *= f_x * self.prob[prefix][suffix]
    # divide to get per-token entropy, exponentiate to get perplexity
    return 2 ** (H.bw / Z)
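# --- Illustrative sketch (not part of the original class) ------------------
# The same per-token perplexity with plain floats instead of BitWeight:
# accumulate count-weighted negative log2 probabilities over the held-out
# n-grams, divide by the token count Z, and exponentiate. The toy bigram
# model below is an assumption made purely for illustration.
import math

def _toy_perplexity(bigram_counts, probs):
    # bigram_counts: {(prefix, suffix): count}; probs: {prefix: {suffix: P(suffix|prefix)}}
    Z = sum(bigram_counts.values())
    H = 0.0
    for (prefix, suffix), f_x in bigram_counts.items():
        H += -f_x * math.log(probs[prefix][suffix], 2)
    return 2 ** (H / Z)

_probs = {('the',): {'cat': 0.5, 'dog': 0.5}}
_counts = {(('the',), 'cat'): 3, (('the',), 'dog'): 1}
print(_toy_perplexity(_counts, _probs))  # 2.0: every observed bigram has probability 0.5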
def _compute_prob(self, corpus):
    """
    Compute a conditional frequency distribution and normalize it into
    BitWeight probabilities
    """
    # conditional frequency distribution representation
    self.freq = defaultdict(lambda: defaultdict(int))
    # populate it
    for (prefix, suffix) in ngrams(corpus, self.order):
        self.freq[prefix][suffix] += 1
    # initialize log2 probability distribution with default value (0)
    self.prob = defaultdict(lambda: defaultdict(lambda: BitWeight()))
    # populate seen probabilities
    for (prefix, suffixes) in self.freq.iteritems():
        # prefix is a tuple of tokens, suffixes is a dictionary
        # containing token: count key/value pairs
        denominator = BitWeight(sum(suffixes.values()))
        # optimize dictionary lookup; both of these are pointers, so
        # modifying pdist (as we will) modifies self.prob too
        fdist = self.freq[prefix]
        pdist = self.prob[prefix]
        # normalize, using fdist[suffix], the full n-gram's count
        for suffix in suffixes:
            # the BitWeight class implements division as log-space
            # subtraction so as to forestall underflow
            pdist[suffix] = BitWeight(fdist[suffix]) / denominator
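# --- Illustrative sketch (assumption: no smoothing) ------------------------
# A plain-dict version of the normalisation above, turning conditional counts
# into P(suffix | prefix) without the log-space BitWeight machinery.
freq = {('the',): {'cat': 3, 'dog': 1}}
prob = {prefix: {suffix: float(c) / sum(suffixes.values())
                 for (suffix, c) in suffixes.items()}
        for (prefix, suffixes) in freq.items()}
print(prob)  # {('the',): {'cat': 0.75, 'dog': 0.25}} (key order may vary)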
def get_ngram_overlap(translation, references):
    trans_ngrams = ngrams(translation)
    all_references = [[] for ignored in range(len(trans_ngrams))]
    for reference in references:
        ref_ngrams = ngrams(reference)
        i = 0
        while i < len(ref_ngrams):
            all_references[i].append(ref_ngrams[i])
            i += 1
    product = 1.0
    i = 0
    while i < len(trans_ngrams):
        matched = match_ngrams(trans_ngrams[i], all_references[i])
        product = product * (float(len(matched)) / len(trans_ngrams[i]))
        i += 1
    return product ** 0.25
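# --- Toy walkthrough (simplified; not from the original module) ------------
# get_ngram_overlap above multiplies one match ratio per n-gram order 1..4 and
# takes the fourth root, i.e. a geometric mean of the per-order precisions.
# The matching rule below (simple membership against a single reference) is a
# simplification of whatever match_ngrams does in the original code.
def _toy_order_ngrams(tokens, max_order=4):
    return [[tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]
            for n in range(1, max_order + 1)]

_trans = "the cat sat on the mat".split()
_ref = "the cat sat on a mat".split()
_product = 1.0
for t_grams, r_grams in zip(_toy_order_ngrams(_trans), _toy_order_ngrams(_ref)):
    matched = [g for g in t_grams if g in r_grams]
    _product *= float(len(matched)) / len(t_grams)
print(_product ** 0.25)  # ~0.56 for this pair of sentences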
def analyse_cable(cable, conn):
    global NGRAM_INSERT, LOCATION_INSERT
    idx = cable["id"]
    # Clean the data
    content = cable.get("content")
    content = cleaner.slugify(content)
    content = cleaner.stopwords(content)
    # Get the ngrams
    records = ngrams(content, n_max=3)
    cities = geo.get_cities(content)
    countries = geo.get_countries(content)
    cur = conn.cursor(cursor_factory=DictCursor)
    # Record ngrams, start transaction
    ngram_rows = []
    for token, count in records.iteritems():
        row = (idx, token, count, cable['date'])
        ngram_rows.append(row)
    # Record geo locations
    location_rows = []
    # Collect cities to insert
    for city in cities:
        row = (idx, city["name"].upper(), "CITY", city["countrycode"],
               city["latitude"], city["longitude"], cable['date'])
        location_rows.append(row)
    # Collect countries to insert
    for country in countries:
        row = (idx, country["name"].upper(), "COUNTRY", country["countrycode"],
               None, None, cable['date'])
        location_rows.append(row)
    try:
        cur.executemany(NGRAM_INSERT, ngram_rows)
        cur.executemany(LOCATION_INSERT, location_rows)
        conn.commit()
    except psycopg2.IntegrityError:
        # Ignore the integrity error
        conn.rollback()
    return cable
def gen_ngrams(n=2, data="pos"):
    "Generate ngrams and save locally"
    temp = []
    for i in os.listdir("%s" % data):
        temp.append(open("%s/" % data + i).read())
    temp = "\n".join(temp)
    aggregate_ngrams = ngrams.ngrams(n, temp)
    pickle.dump(aggregate_ngrams, open("%s_%sgram.dump" % (data, n), 'w'))
def train(self):
    pos_train = [{} for f in self.pos_train_data]
    neg_train = [{} for f in self.neg_train_data]
    # Reading files
    for (j, lim) in zip(self.n, self.limit):
        all_grams = [ngrams.ngrams(j, f, self.negation)
                     for f in self.pos_train_data]
        for i in range(len(self.pos_train_data)):
            pos_train[i].update(all_grams[i])
        featureslist = all_grams
        all_grams = [ngrams.ngrams(j, f, self.negation)
                     for f in self.neg_train_data]
        for i in range(len(self.neg_train_data)):
            neg_train[i].update(all_grams[i])
        featureslist.extend(all_grams)
        # Collapsing, limiting ngrams
        self.features.update(
            ngrams.top_ngrams(ngrams.collapse_ngrams(featureslist), lim))
    # Creating index
    self.classifier = self.clsf(restrictFeatures=self.features)
    print "# features: %s" % self.classifier.nfeatures
    if self.idf:
        print "Using TF-IDF"
        idf = ngrams.ngrams_to_idf(pos_train + neg_train)
        for i in range(len(pos_train)):
            for j in pos_train[i]:
                pos_train[i][j] = pos_train[i][j] * idf[j]
        for i in range(len(neg_train)):
            for j in neg_train[i]:
                neg_train[i][j] = neg_train[i][j] * idf[j]
    # Making classifier
    for i in pos_train:
        self.count += 1
        self.classifier.addFeatureVector(i, 1, binary=self.binary)
    for i in neg_train:
        self.classifier.addFeatureVector(i, -1, binary=self.binary)
    self.classifier.compile()
def test(self):
    pos_test_votes = False
    neg_test_votes = False
    for t in self.testers:
        pos_tests = [{} for f in t.pos_test_data]
        neg_tests = [{} for f in t.neg_test_data]
        for j in t.n:
            for i in range(len(t.pos_test_data)):
                pos_tests[i].update(
                    ngrams.ngrams(j, t.pos_test_data[i], self.negation))
            for i in range(len(t.neg_test_data)):
                neg_tests[i].update(
                    ngrams.ngrams(j, t.neg_test_data[i], self.negation))
        pos_results = [t.classifier.classify(i) for i in pos_tests]
        neg_results = [t.classifier.classify(i) for i in neg_tests]
        if not pos_test_votes:
            pos_test_votes = pos_results
        else:
            for i in range(len(pos_test_votes)):
                pos_test_votes[i] += pos_results[i]
        if not neg_test_votes:
            neg_test_votes = neg_results
        else:
            for i in range(len(neg_test_votes)):
                neg_test_votes[i] += neg_results[i]
    pos_correct = 0
    neg_correct = 0
    for i in pos_test_votes:
        if i > 0:
            pos_correct += 1
    for i in neg_test_votes:
        if i < 0:
            neg_correct += 1
    print "Positive: %s of %s, %s accuracy" % (
        pos_correct, len(pos_test_votes),
        float(pos_correct) / len(pos_test_votes))
    print "Negative: %s of %s, %s accuracy" % (
        neg_correct, len(neg_test_votes),
        float(neg_correct) / len(neg_test_votes))
    return (float(pos_correct) / len(pos_test_votes),
            float(neg_correct) / len(neg_test_votes))
def substitution_score_bind(individual):
    decoded_msg = substitution_cipher(
        msg, [dict(zip(substitution, alphabet)) for substitution in individual])
    decoded_msg = ngrams(decoded_msg, ngram_size)
    ngrams_frequency = calculate_frequency_norm(decoded_msg)
    score = 0
    for ngram in ngrams_frequency:
        score += eng_frequency.get(ngram, 0) * ngrams_frequency[ngram]
    return score
def all_ngrams_multiple_references(n, references):
    ''' Count the total number of n-grams over all references. '''
    ngram_count = 0
    for rr in references:
        ng_refs = ngrams(rr.split(), n)
        ngram_count += len(ng_refs)
    return ngram_count
def substitution_score_bind(substitution):
    decoded_msg = substitution_cipher(
        msg, [dict(zip(substitution['alphabet'], alphabet))])
    decoded_msg = ngrams(decoded_msg, ngram_size)
    ngrams_frequency = calculate_frequency_norm(decoded_msg)
    for ngram in ngrams_frequency:
        if substitution['score'] is None:
            substitution['score'] = 0
        substitution['score'] += (eng_frequency.get(ngram, 0) *
                                  ngrams_frequency[ngram])
    return substitution
def counts_max_multiple_references(system, references, n):
    '''
    given: word ww from system output, reference sentences r1 r2 r3 r4;
    ww occurs o1 times in r1, o2 in r2, o3 in r3, o4 in r4.
    Take the max of o1 o2 o3 o4 and sum over all ww.
    '''
    max_word_count = Counter()
    ng_sys = ngrams(system, n)
    for ww in ng_sys:
        counts = Counter()
        for rr in references:
            ng_refs = ngrams(rr.split(), n)
            if ww in ng_refs:
                counts[rr] = ng_refs[ww]
            else:
                counts[rr] = 0
        max_word_count[ww] = max(counts.values())
    return sum(max_word_count.values())
def addbigrams(dft, dfte, df1, selector=0, n=50):
    top = topwords(df1, 'Clean tweet', n)
    bigrams = ngrams(df1, 'Clean tweet')
    bigramsw = bigrams.bigrams
    main_domain = join(dft, 'Clean tweet')
    main_domain1 = join(dfte, 'Clean tweet')
    main_domain.joinall(bigramsw, 2)
    main_domain1.joinall(bigramsw, 2)
    return main_domain.df, main_domain1.df
def getmostcommon(df, df1, n=10):
    main_domain = join(df, 'Clean tweet')
    main_domain1 = join(df1, 'Clean tweet')
    # self.df2 appears to refer to a class-level dataframe, suggesting this
    # function was originally a method; the vocabulary is drawn from it
    # rather than from df or df1
    top = topwords(self.df2, 'Clean tweet', n)
    bigrams = ngrams(self.df2, 'Clean tweet', n)
    topw = top.top
    bigramsw = bigrams.bigrams
    main_domain.joinall(topw, 1)
    main_domain.joinall(bigramsw, 2)
    main_domain1.joinall(topw, 1)
    main_domain1.joinall(bigramsw, 2)
    return main_domain.df, main_domain1.df
def getintersection(df, selector=0, n=50):
    main_domain = join(df, 'Clean tweet')
    top = topwords(df, 'Clean tweet', n)
    bigrams = ngrams(df, 'Clean tweet')
    topw = top.top
    bigramsw = bigrams.bigrams
    main_domain.joinall(topw, 1)
    mutualwordsu = mutualinfo(main_domain.df)
    main_domain.joinall(bigramsw, 2)
    mutualwordsb = mutualinfo(main_domain.df)
    mutualwordsb = [e for e in mutualwordsb if e not in mutualwordsu]
    ratiov = ratio(main_domain.df, 'L')
    ratios = ratiov.getoddratios(top.top)
    dratios = list(ratios.keys())
    return topw, bigramsw, dratios, mutualwordsu, mutualwordsb
def sentencer(inpfile):
    in1 = open(inpfile, "r")
    content = str(in1.read())
    #txt = ' '.join(content.split())
    txt = content.split(".")
    del txt[-1]
    root_dict = dict()
    ngram_dict = dict()
    linecounter = 1
    for i in txt:
        currentline = i.strip()
        #print "Current Line : " + currentline
        root_list = inflection.stem(currentline)
        root_dict[linecounter] = root_list
        linecounter += 1
    in1.close()
    for linenumber, rootlist in root_dict.iteritems():
        #print "Line Number : " + str(linenumber)
        #print "Number of root words = " + str(len(rootlist))
        ngram = ngrams.ngrams(rootlist, 1)
        ngram_dict[linenumber] = ngram
    return ngram_dict
from EntityRelation import EntityRelation

print "candidate generation started"
ER = EntityRelation(sentences_path, full_sentence_path, pos_path, full_pos_path,
                    frequent_patterns_path, significance, out_path, capitalize)
ER.extract()
print 'Candidate generation done.'

""" Graph """
os.chdir(os.path.join(src_file_path, "src"))

### step 0
from step0 import step0
step0(out_path, DataStatsFile, data_path)

""" Word counts """
os.chdir(os.path.join(src_file_path, "EntityCleaning"))
from ngrams import ngrams
ngrams(raw_text_path, intermediate_data_path)

""" RUN with either PMI or t-score, because the segment file will be updated
based on the collocation score """

""" cleaning entity mentions based on collocation measure (PMI) """
if collocation_measure == "PMI":
    os.chdir(os.path.join(src_file_path, "EntityCleaning"))
    from collocation import collocation
    collocation(Entity_file_path, intermediate_data_path, num_lines, "PMI")

if collocation_measure == "t-score":
    """ cleaning entity mentions based on collocation measure (t-score) """
    os.chdir(os.path.join(src_file_path, "EntityCleaning"))
    from collocation import collocation
    collocation(Entity_file_path, intermediate_data_path, num_lines, "t-score")

""" outlier detection in seed file by type """
os.chdir(os.path.join(src_file_path, "NoiseDetection"))
from get_seed_file import get_seed_file
from Indexes import Indexes
import matplotlib.pyplot as plt
from classifier import MaximumEntropyClassifier

TRAIN_SIZE = 800
n = 1

print "Maximum Entropy"
pos = os.listdir("pos")
neg = os.listdir("neg")
ind = Indexes('r', 1, TRAIN_SIZE)
print "> determined Indices"
ind.next()
pos_grams = [ngrams.ngrams(n, open("pos/" + pos[i]).read())
             for i in ind.get_pos_train_ind()]
pos_collapsed_grams = ngrams.top_ngrams(ngrams.collapse_ngrams(pos_grams), 16165)
neg_grams = [ngrams.ngrams(n, open("neg/" + neg[i]).read())
             for i in ind.get_neg_train_ind()]
neg_collapsed_grams = ngrams.top_ngrams(ngrams.collapse_ngrams(neg_grams), 16165)
print "> collapsed grams"
trainingset = [([k], 'pos', v) for (k, v) in pos_collapsed_grams.iteritems()]
trainingset.extend([([k], 'neg', v) for (k, v) in neg_collapsed_grams.iteritems()])
m = MaximumEntropyClassifier(trainingset)
print "> created model"
pos_res = []
neg_res = []
pos_tests = [ngrams.ngrams(n, open("pos/" + pos[i]).read())
             for i in ind.get_pos_test_ind()]
pos_results = [m.classify(test) for test in pos_tests]
pos_correct = len([i for i in pos_results if i >= 0.5])
def index_of_coincidence(sequence):
    sequence = ngrams(sequence, 1)
    ngrams_frequency = calculate_frequency_norm(sequence)
    return reduce(
        lambda acc, symbol: acc + (ngrams_frequency[symbol] / 100) ** 2,
        ngrams_frequency, 0)
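# --- Illustrative check (not from the original module) ---------------------
# Assuming calculate_frequency_norm returns percentages (hence the /100 above),
# the index of coincidence is the sum of squared per-symbol frequencies;
# English-like text scores well above the ~0.038 expected for uniform letters.
from collections import Counter

def _toy_ioc(text):
    counts = Counter(text)
    total = float(len(text))
    return sum((c / total) ** 2 for c in counts.values())

print(_toy_ioc("abcdefghijklmnopqrstuvwxyz"))        # 1/26 ~ 0.0385 (uniform)
print(_toy_ioc("the theme of the thesis is these"))  # noticeably higher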
def domain(document, crossvalidationundersampling, ArffL, A=0,
           undersampler=0, sentiment=0):
    test = pd.read_csv('documents\csv\drunk\drunkTEXT400' + '.csv')
    test.L = test.L.replace(['y', 'n'], ['True', 'False'])
    df1 = pd.read_csv(document + '.csv')
    df1.L = df1.L.replace(['y', 'n'], ['True', 'False'])
    joinc = joindocuments(df1, df1)
    top = topwords(df1, 'Clean tweet', 100)
    main_domain = join(df1, 'Clean tweet')
    bigrams = ngrams(df1, 'Clean tweet')
    print 'bigrams'
    print bigrams.bigrams
    main_domain.joinall(bigrams.bigrams, 2)
    main_domain.joinall(top.top, 1)
    main_domain.df.to_csv('prueba.csv', index=False)
    ratiov = ratio(main_domain.df, 'L')
    ratios = ratiov.getoddratios(top.top)
    print 'ratios'
    print ratios
    ds = list(ratios.keys())
    testobject = join(test, 'Clean tweet')
    oddradiojoin = join(df1, 'Clean tweet')
    oddradiojoin.joinall(ds, 1)
    testobject.joinall(ds, 1)
    oddradiojoin.joinall(bigrams.bigrams, 2)
    testobject.joinall(bigrams.bigrams, 2)
    test = testobject.df
    cols = ['Clean tweet']
    if sentiment == 1:
        cols = ['Clean tweet', 'sentiment_polarity',
                'sentiment_subjectivity', 'absPolarity']
    try:
        for x in cols:
            del oddradiojoin.df[x]
            del test[x]
    except:
        pass
    #training, test=joinc.gettrainingandtestp(oddradiojoin.df)
    print 'matrix of elements to reduce'
    print "saul,", oddradiojoin.df.shape
    #########################################################
    if undersampler == 1:
        print "saul,", oddradiojoin.df.shape
        oddradiojoin.df = joinc.undersampling(oddradiojoin.df)
        print oddradiojoin.df.shape
    if A == 1:
        dftraining, dftest = pcaf(oddradiojoin.df, test)
        oddradiojoin.df = dftraining.join(oddradiojoin.df["L"])
        test = dftest.join(test["L"])
        print oddradiojoin.df.shape
    training = oddradiojoin.df
    training = training.replace(['True', 'False'], [True, False])
    test = test.replace(['True', 'False'], [True, False])
    training = training.astype(np.float64)
    test = test.astype(np.float64)
    training['L'] = training['L'].astype(bool)
    test['L'] = test['L'].astype(bool)
    A = str(A)
    sentiment = str(sentiment)
    oddradiojoin.df.to_csv('crossvalidation.csv', index=False)
    #undersampleddf1.to_csv(str(crossvalidationundersampling) + '\undersampling' + A + '.csv', index=False)
    headers_names = list(training.columns.values)
    headers_names.remove('L')
    headers_names.append('L')
    headers_names1 = list(test.columns.values)
    print headers_names, 'headers test', headers_names1
    test = test[headers_names]
    training = training[headers_names]
    print 'training' + str(training.dtypes)
    test.to_csv(str(crossvalidationundersampling) + r'\test1' + A + '.csv',
                index=False)
    training.to_csv(str(crossvalidationundersampling) + r'\training1' + A + '.csv',
                    index=False)
    TRAINING = training.as_matrix(columns=None)
    TEST = test.as_matrix(columns=None)
    print 'training'
    print training.dtypes
    arff.dump(ArffL + r'\trainingwu' + A + str(undersampler) + sentiment + '.arff',
              TRAINING, relation="whatever", names=headers_names)
    arff.dump(ArffL + r'\testwu' + A + str(undersampler) + sentiment + '.arff',
              TEST, relation="whatever", names=headers_names)
def calculate_brevity_penalty(translation, references):
    mean_ref_len = statistics.mean([len(ngrams(ref)[0]) for ref in references])
    mean_ref_len += 0.5
    return min(1, len(ngrams(translation)[0]) / math.floor(mean_ref_len))
eng_frequency = pd.read_csv('../ngrams-frequency/letter_frequency.csv')
eng_frequency['ngram'] = eng_frequency['ngram'].map(
    lambda ng: tuple([s for s in ng]))
eng_frequency = eng_frequency.set_index('ngram')

KEY_RANGE = 256
KEY_LEN = key_len(encoded_utf8)
KEY = []
for i in range(KEY_LEN):
    msg = encoded_utf8[i::KEY_LEN]
    best_key = -1
    best_score = sys.maxsize
    for key in range(KEY_RANGE):
        decoded_msg = vigenere_cipher(msg, chr(key).encode())
        decoded_msg = ngrams(decoded_msg, 1)
        ngrams_frequency = calculate_frequency_norm(decoded_msg)
        score = 0
        for ngram in ngrams_frequency:
            if ngram not in ALPHABET:
                score = sys.maxsize
                break
            else:
                score += (ngrams_frequency[ngram] -
                          eng_frequency.get(ngram, 0)) ** 2
        score = score ** 0.5
        best_score, best_key = ((score, key) if score < best_score
                                else (best_score, best_key))