import csv
import glob
import os


def main():
    os.chdir("./dataset")
    # NOTE: the original used the Python 2 idiom reload(sys);
    # sys.setdefaultencoding('utf8'). In Python 3 this is unnecessary:
    # pass encoding='utf-8' to open() instead.
    ngrams_set = set()
    with open('output.csv', 'w', encoding='utf-8', newline='') as output:
        csvwriter = csv.writer(output)
        csvwriter.writerow(["Data File", "Narrative", "N-grams"])
        for filename in glob.glob("*.txt"):
            with open(filename, 'r', encoding='utf-8') as reader:
                text = ''
                for line in reader:
                    # NOTE: lines are concatenated without a separator, so
                    # words can be glued together across line breaks
                    text = text + line.rstrip('\n\r').lower()
            print("\nProcessing " + filename)
            # Tokenization
            tokens = get_tokens(text)
            # Part-of-speech tagging
            tag_set = get_pos_tags(tokens)
            # Technical n-gram extraction
            ngrams = get_tech_ngrams(text, tag_set)
            # Write one row per file: name, full text, extracted n-grams
            csvwriter.writerow([filename, text, list(ngrams.keys())])
            ngrams_set = ngrams_set.union(set(ngrams.keys()))
    ngrams_list = lexicon_expansion(list(ngrams_set))
    with open('ngrams.txt', 'w', encoding='utf-8') as writer:
        for s in ngrams_list:
            writer.write(str(s) + '\n')
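# The driver above depends on helpers that are not shown. A minimal sketch of
# what get_tokens() and get_pos_tags() could look like with NLTK; these are
# hypothetical implementations, and get_tech_ngrams() and lexicon_expansion()
# are assumed to be defined elsewhere.
import nltk


def get_tokens(text):
    # Split raw text into word tokens.
    return nltk.word_tokenize(text)


def get_pos_tags(tokens):
    # Tag each token with its part of speech, e.g. [('engine', 'NN')].
    return nltk.pos_tag(tokens)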
import nltk
from nltk.tokenize import sent_tokenize, regexp_tokenize


def get_features(review, polarity):
    # `hotels` (a collection of hotel names), get_bigrams() and
    # get_sentimentFeatures() are assumed to be defined elsewhere.
    features = {}
    uniqueWords = 0
    personalRatio = 0
    personal = 0
    misspelt = 0
    hotelName = 0
    personalPronouns = ["i", "me", "we", "our", "ours", "mine"]
    sentences = sent_tokenize(review)
    sent = nltk.word_tokenize(review)  # (unused in the original)
    s = len(sentences)
    wordsR = regexp_tokenize(review, r"\w+")
    for x in wordsR:
        if x in personalPronouns:
            personal += 1
        # Spell-check disabled in the original:
        # if x not in set(words.words()):
        #     misspelt += 1
        if x in hotels:
            hotelName += 1
    w = len(wordsR)
    unique = len(set(wordsR))
    uniqueWords += unique
    review = review.replace(" ", "")
    c = len(review)
    cap = 0
    features['dollar'] = False
    for i in range(len(review)):
        if review[i].isupper():  # the original had .isupper (always truthy)
            cap += 1
        if review[i] == '$':
            features['dollar'] = True
    # Automated Readability Index
    ari = 4.71 * (float(c) / w) + 0.5 * (float(w) / s) - 21.43
    # The original divided c by s, but with .isupper missing its parentheses
    # cap always equalled c; capitals per sentence appears to be the intent.
    capRatio = cap / float(s)
    personalRatio += float(personal) / w
    features['uniqueWords'] = uniqueWords
    features['personalRatio'] = personalRatio
    features['ari'] = ari
    features['capRatio'] = capRatio
    features['polarity'] = polarity
    features['hotel'] = hotelName
    # NOTE: at this point `review` has had its spaces stripped
    ngrams = get_bigrams(review, 'x')
    sentiments = get_sentimentFeatures(review, 'x')
    for x in ngrams.keys():
        features[x] = ngrams[x]
    for x in sentiments.keys():
        features[x] = sentiments[x]
    features['misspelt'] = misspelt
    return features
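# The 'ari' feature above is the Automated Readability Index:
#   ARI = 4.71 * (chars / words) + 0.5 * (words / sentences) - 21.43
# A quick standalone check of the formula (ari_score is a hypothetical helper;
# the numbers are illustrative only):
def ari_score(chars, words, sentences):
    return 4.71 * (float(chars) / words) + 0.5 * (float(words) / sentences) - 21.43

# A 2-sentence, 20-word, 90-character review:
# 4.71 * 4.5 + 0.5 * 10 - 21.43 = 4.765
print(ari_score(90, 20, 2))  # -> 4.765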
def GenerateNgrams(text, N):
    # N = N - 1
    # Store words (slices)
    words = []
    # Store ngram sets
    ngrams = {}
    # Iterate over the rows of the matrix
    for i in range(text.shape[0]):
        word = ""
        # Build a "word" (a set of numbers separated by ','). We need to do
        # this because each row (array) of the matrix is a word, not a
        # sentence. Treating each row as a sentence gave terrible results,
        # e.g. "1-5.csv" (attached).
        for j in text[i]:
            word = word + str(j) + ","
        # Remove the trailing ',' because it is not needed
        word = word[:-1]
        # Save all the words
        words.append(word)
    if N - 1 > 0:
        # Build all the ngram sets: map each (N-1)-word context to the list
        # of words that follow it
        for i in range(len(words) - (N - 1)):
            sequence = ' '.join(words[i:i + (N - 1)])
            if sequence not in ngrams:
                ngrams[sequence] = []
            ngrams[sequence].append(words[i + (N - 1)])
    if DEPURATION:
        print("Words: " + str(words) + "\n")
        print("Ngrams: " + str(ngrams) + "\n")
        print()
    return ngrams, words
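# Worked illustration of the mapping GenerateNgrams builds (a sketch assuming
# the rows are small integer pairs): for N=3 the key is a 2-word context and
# the value lists every word observed right after it.
#
#   rows:          [1,5] [2,5] [1,5] [2,5] [3,4]
#   words:         ['1,5', '2,5', '1,5', '2,5', '3,4']
#   ngrams (N=3):  {'1,5 2,5': ['1,5', '3,4'], '2,5 1,5': ['2,5']}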
import gc
import os
import sys
from collections import defaultdict

import colibricore
from pynlpl.textprocessors import MultiWindower, Windower

# begin(), end(), savemodel() and colorf() are timing/reporting helpers
# defined elsewhere in this benchmark script.


def main():
    dopretests = True
    try:
        tests = sys.argv[1]
        if tests[0] == 'x':
            dopretests = False
            tests = tests[1:]
        if '-' in tests:
            begintest = int(tests.split('-')[0])
            endtest = int(tests.split('-')[1])
        else:
            begintest = endtest = int(tests)
    except:
        print("Specify a text file (plain text, UTF-8, one sentence per line, preferably tokenised) to use as a basis", file=sys.stderr)
        sys.exit(2)
    try:
        textfile = sys.argv[2]
    except:
        print("Specify a text file (plain text, UTF-8, one sentence per line, preferably tokenised) to use as a basis", file=sys.stderr)
        sys.exit(2)
    try:
        tmpdir = sys.argv[3]
    except:
        tmpdir = "/tmp/"

    classfile = tmpdir + "/" + os.path.basename(textfile) + '.colibri.cls'
    datafile = tmpdir + "/" + os.path.basename(textfile) + '.colibri.dat'
    modelfile = tmpdir + "/" + os.path.basename(textfile) + '.colibri.patternmodel'

    if not os.path.exists(textfile):
        print("File does not exist", file=sys.stderr)
        sys.exit(2)

    if dopretests:
        linecount = 0
        print("PRETEST #1 - Reading text file (Python)")
        b = begin()
        with open(textfile, 'r', encoding='utf-8') as f:
            for line in f:
                linecount += 1
        end(b)
        print("\t(Read " + str(linecount) + " lines)")

        print("PRETEST #2 - Building class encoder")
        encoder = colibricore.ClassEncoder()
        b = begin()
        encoder.build(textfile)
        end(b)

        print("PRETEST #3 - Saving class encoder")
        b = begin()
        encoder.save(classfile)
        end(b)

        print("PRETEST #4 - Class encoding corpus")
        b = begin()
        encoder.encodefile(textfile, datafile)
        end(b)

        print("PRETEST #5 - Unloading encoder")
        b = begin()
        del encoder
        gc.collect()
        end(b)

    if begintest < endtest:
        # NOTE: the range is capped at test 9 here, so tests 10 and 11 can
        # only be run individually
        print("Running tests ", begintest, " to ", endtest)
        for testnum in range(begintest, min(endtest + 1, 10)):
            os.system("python3 " + sys.argv[0] + " x" + str(testnum) + " " + textfile + " " + tmpdir)
    else:
        testnum = begintest
        print("-------------------- " + colorf('bold', 'TEST') + " #" + str(testnum) + " ----------------------")
        if testnum == 1:
            print("Extracting and counting n-grams (up to 8-grams, threshold=1) naively (Python defaultdict + PyNLPl MultiWindower)")
            ngrams = defaultdict(int)
            b = begin()
            with open(textfile, 'r', encoding='utf-8') as f:
                for line in f:
                    for ngram in MultiWindower(line, 1, 8):
                        ngrams[ngram] += 1
            end(b)
            print("\t(Found " + str(len(ngrams)) + " ngrams)")
        elif testnum == 2:
            print("Extracting and counting n-grams (up to 8-grams, threshold=1) naively with NLTK (nltk.FreqDist + nltk.util.ngrams)")
            from nltk.probability import FreqDist
            from nltk.util import ngrams
            fd = FreqDist()
            b = begin()
            with open(textfile, 'r', encoding='utf-8') as f:
                for line in f:
                    tokens = line.split(' ')
                    for n in range(1, 9):
                        for ngram in ngrams(tokens, n):
                            fd[ngram] += 1
            end(b)
            print("\t(Done)")
        elif testnum == 3:
            print("Extracting and counting ALL n-grams (up to 8-grams, threshold=1) with UnindexedPatternModel")
            model = colibricore.UnindexedPatternModel()
            options = colibricore.PatternModelOptions(mintokens=1, maxlength=8, doreverseindex=False)
            b = begin()
            model.train(datafile, options)
            end(b)
            savemodel(model, modelfile)
            del model
        elif testnum == 4:
            print("Extracting and counting n-grams (up to 8-grams, threshold=2, with look-back) (Python defaultdict + PyNLPl Windower)")
            ngrams = defaultdict(int)
            b = begin()
            for n in range(1, 9):
                with open(textfile, 'r', encoding='utf-8') as f:
                    for line in f:
                        for ngram in Windower(line, n):
                            docount = True
                            if n > 1:
                                # Only count an n-gram if all of its
                                # (n-1)-gram parts were already counted
                                # (look-back pruning)
                                for subngram in Windower(ngram, n - 1):
                                    if subngram not in ngrams:
                                        docount = False
                                        break
                            if docount:
                                ngrams[ngram] += 1
            end(b)
            print("\t(Found " + str(len(ngrams)) + " ngrams)")
        elif testnum == 5:
            print("Extracting and counting n-grams (up to 8-grams, threshold=2, without look-back) (Python defaultdict + PyNLPl MultiWindower)")
            ngrams = defaultdict(int)
            b = begin()
            with open(textfile, 'r', encoding='utf-8') as f:
                for line in f:
                    for ngram in MultiWindower(line, 1, 8):
                        ngrams[ngram] += 1
            # Prune everything below the threshold afterwards
            for ngram in list(ngrams.keys()):
                if ngrams[ngram] < 2:
                    del ngrams[ngram]
            gc.collect()
            end(b)
            print("\t(Found " + str(len(ngrams)) + " ngrams)")
        elif testnum == 6:
            print("Extracting and counting ALL n-grams (up to 8-grams, threshold=2) with UnindexedPatternModel")
            model = colibricore.UnindexedPatternModel()
            options = colibricore.PatternModelOptions(mintokens=2, maxlength=8)
            b = begin()
            model.train(datafile, options)
            end(b)
            savemodel(model, modelfile)
        elif testnum == 7:
            print("Extracting and counting ALL n-grams (up to 8-grams, threshold=1) with UnindexedPatternModel (with preloaded corpus)")
            corpus = colibricore.IndexedCorpus(datafile)
            model = colibricore.UnindexedPatternModel(reverseindex=corpus)
            options = colibricore.PatternModelOptions(mintokens=1, maxlength=8)
            b = begin()
            model.train(datafile, options)
            end(b)
            savemodel(model, modelfile)
        elif testnum == 8:
            print("Extracting and counting ALL n-grams (up to 8-grams, threshold=1) with IndexedPatternModel (with preloaded corpus)")
            corpus = colibricore.IndexedCorpus(datafile)
            model = colibricore.IndexedPatternModel(reverseindex=corpus)
            options = colibricore.PatternModelOptions(mintokens=1, maxlength=8)
            b = begin()
            model.train(datafile, options)
            end(b)
            savemodel(model, modelfile)
            del model
        elif testnum == 9:
            print("Extracting and counting n-grams with threshold 2 (up to 8-grams) with IndexedPatternModel (with preloaded corpus)")
            corpus = colibricore.IndexedCorpus(datafile)
            model = colibricore.IndexedPatternModel(reverseindex=corpus)
            options = colibricore.PatternModelOptions(mintokens=2, maxlength=8)
            b = begin()
            model.train(datafile, options)
            end(b)
            savemodel(model, modelfile)
        elif testnum == 10:
            print("Extracting and counting n-grams and skipgrams with threshold 2 (up to 8-grams) with IndexedPatternModel (with preloaded corpus)")
            corpus = colibricore.IndexedCorpus(datafile)
            model = colibricore.IndexedPatternModel(reverseindex=corpus)
            options = colibricore.PatternModelOptions(mintokens=2, maxlength=8, doskipgrams=True)
            b = begin()
            model.train(datafile, options)
            end(b)
            savemodel(model, modelfile)
        elif testnum == 11:
            print("Extracting and counting ALL n-grams (up to 8-grams, threshold=1) with OrderedUnindexedPatternModel")
            model = colibricore.OrderedUnindexedPatternModel()
            options = colibricore.PatternModelOptions(mintokens=1, maxlength=8, doreverseindex=False)
            b = begin()
            model.train(datafile, options)
            end(b)
            savemodel(model, modelfile)
            del model
        else:
            print("No such test", file=sys.stderr)
        print()
import random


def GenerateText(ngrams, words, N, width):
    if N > 1:
        # The first N-1 words of the text, i.e. the initial ngram context
        currentSequence = ' '.join(words[0:(N - 1)])
    else:
        currentSequence = words[0]
    # Used to restart generation when the current context has no known
    # successor
    firstSequence = currentSequence
    if DEPURATION:
        print("First sequence: " + str(firstSequence))
    # Seed the output text with the first sequence
    output = currentSequence
    # Iterate to create the new words (vertical slices) that make up the map
    i = 0
    while i < width:
        if (N - 1) > 0:
            if currentSequence not in ngrams.keys():
                # Dead end: restart from the first sequence
                currentSequence = firstSequence
                possibleWords = ngrams[currentSequence]
                nextWord = possibleWords[random.randrange(len(possibleWords))]
                output += ' ' + currentSequence + ' ' + nextWord
                if DEPURATION:
                    print("Output reset: " + str(output))
                    print()
                i += N
            else:
                possibleWords = ngrams[currentSequence]
                nextWord = possibleWords[random.randrange(len(possibleWords))]
                if DEPURATION:
                    print("Current sequence: " + str(currentSequence))
                    print("Possible words: " + str(possibleWords))
                    print("Next word: " + str(nextWord))
                    print()
                output += ' ' + nextWord
                # The original had this increment commented out, which makes
                # the loop advance only on resets (and hang otherwise)
                i += 1
            # Take the last N-1 generated words as the next context
            auxSequence = output.split(' ')
            currentSequence = ' '.join(auxSequence[len(auxSequence) - (N - 1):])
            if DEPURATION:
                print("Next step sequence: " + str(currentSequence))
                print()
        else:
            # Unigram case: sample words uniformly
            nextWord = words[random.randrange(len(words))]
            output += ' ' + nextWord
            i += 1
    return output.split(' ')
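# A minimal end-to-end sketch of the two functions above (assumptions: numpy
# is available, DEPURATION is a module-level debug flag, and the matrix holds
# map rows as in the original setup; the values are illustrative only):
import numpy as np

DEPURATION = False

if __name__ == "__main__":
    # Each row becomes one "word" such as '1,5'; with N=2 every word maps to
    # the list of words observed right after it.
    matrix = np.array([[1, 5], [2, 5], [1, 5], [3, 4], [1, 5], [2, 5]])
    ngrams, words = GenerateNgrams(matrix, 2)
    # ngrams == {'1,5': ['2,5', '3,4', '2,5'], '2,5': ['1,5'], '3,4': ['1,5']}
    generated = GenerateText(ngrams, words, 2, 10)
    print(' '.join(generated))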
with open("./{}_no_clickbait_big.pickle".format(name), "rb") as f: ngrams_no_clickbait[key] = pickle.load(f) # normalize the counts # ---> count of each ngram / total occurrences of ngram # select top 0.0005 of each (uni, bi, tri, 4), for C and NC separately # loop over the dataset again, separately for C and NC rows # for C data -> count how many posts contain top C list ngrams # for NC data -> count how many posts contain top NC list ngrams clickbait_final_list = [] for n, ngrams in ngrams_clickbait.items(): normalizer = sum(ngrams.values()) for ngram in ngrams.keys(): ngrams[ngram] /= normalizer ngrams_clickbait[n] = ngrams.most_common(int(len(ngrams.keys()) * 0.005)) clickbait_final_list += [elem[0] for elem in ngrams_clickbait[n]] clickbait_final_list = set(clickbait_final_list) no_clickbait_final_list = [] for n, ngrams in ngrams_no_clickbait.items(): normalizer = sum(ngrams.values()) for ngram in ngrams.keys(): ngrams[ngram] /= normalizer ngrams_no_clickbait[n] = ngrams.most_common(int( len(ngrams.keys()) * 0.005))