def get(self):
    # tokenList = word_tokenize("John's big idea isn't all that bad.")
    # tokenList = pos_tag(word_tokenize("John's big idea isn't all that bad."))
    stemmer = PorterStemmer()
    plurals = ['caresses', 'flies', 'dies', 'mules', 'denied', 'died',
               'agreed', 'owned', 'humbled', 'sized', 'meeting', 'stating',
               'siezing', 'itemization', 'sensational', 'traditional',
               'reference', 'colonizer', 'plotted']
    singles = []
    for plural in plurals:
        singles.append(stemmer.stem(plural))
    self.response.headers['Content-Type'] = 'text/html'
    self.response.out.write('Hello test!')
    # out.write expects a string, not a list
    self.response.out.write(str(singles))
    nlProcessor = NLPlib()
    s = ("Very little is known about Beethoven's childhood. He was baptized "
         "on December 17, 1770 and was probably born a few days before that. "
         "[1][4][5][6] Beethoven's parents were Johann van Beethoven (1740 in "
         "Bonn - December 18, 1792) and Maria Magdalena Keverich (1744 in "
         "Ehrenbreitstein - July 17, 1787).")
    v = nlProcessor.tokenize(s)
    t = nlProcessor.tag(v)
    for i in range(len(v)):
        self.response.out.write(v[i] + "(" + t[i] + ")<br/>")
def split_sentences(input_fname, output_fname):
    '''Tokenize and tag each line of input_fname, writing one tagged
    sentence per line to output_fname, with tweets delimited by "|" lines.'''
    tagger = NLPlib.NLPlib()
    input_f = open(input_fname, "r")
    output_f = open(output_fname, "w+")
    output_f.write("|\n")
    for line in input_f:
        line = clean(line)
        sentences = handle_mult_punctuation(line)
        #print sentences
        for i in range(len(sentences)):
            sent = sentences[i].split()
            tags = tagger.tag(sent)
            for j in range(len(sent)):
                sent[j] += ("/" + tags[j])
            #print sentences
            sent_line = " ".join(sent)
            output_f.write(sent_line)
            output_f.write("\n")
        #if line[len(line)-1:] != "\n":
        #    output_f.write("\n")
        output_f.write("|\n")
    input_f.close()
    output_f.close()
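
# clean() and handle_mult_punctuation() are not defined in this snippet.
# Minimal sketches of what they might look like, assuming clean() strips
# HTML tags and surrounding whitespace and handle_mult_punctuation() splits
# a tweet into sentences on runs of end punctuation; the bodies below are
# assumptions, not the original implementations.
import re

def clean(line):
    # Hypothetical: drop HTML tags and trim whitespace.
    return re.sub(r'<[^>]+>', '', line).strip()

def handle_mult_punctuation(line):
    # Hypothetical: split after ., ! or ?, keeping the punctuation
    # attached to its sentence.
    parts = re.split(r'(?<=[.!?])\s+', line)
    return [p for p in parts if p]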
def test():
    tweets = get_file_data(sys.argv[1])
    posts = []
    sys.path.append("/home/nsatvik/twitminer/miner")
    print "1-Sports 2-Politics"
    tagger = NLPlib()
    for t in tweets:
        posts.append(tweet(t, 1))
        print posts[-1].get_text()
        a = input("1 to display tags")
        if a == 1:
            words = tagger.tokenize(posts[-1].get_text())
            tags = tagger.tag(words)
            for i in range(len(words)):
                print words[i], " ", tags[i]
        else:
            continue
def tag_PoS(self, texts):
    if self.tagger is None:
        self.tagger = NLPlib.NLPlib()
    processed_texts = []
    for sentence in texts:
        tags = self.tagger.tag(sentence)
        processed_texts.append(
            [x + '/' + y for x, y in zip(sentence, tags)])
    return processed_texts
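
# A small usage sketch for tag_PoS, assuming `texts` is a list of
# pre-tokenized sentences (lists of word strings). The instance and the
# example tags below are illustrative, not taken from the original code:
#
#   tagged = obj.tag_PoS([['I', 'like', 'pie'], ['Dogs', 'bark']])
#   # -> e.g. [['I/PRP', 'like/VBP', 'pie/NN'], ['Dogs/NNS', 'bark/VBP']]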
def post(self):
    nlProcessor = NLPlib()
    content = self.request.get('content')
    tokens = nlProcessor.tokenize(content)
    taggedContent = nlProcessor.tag(tokens)
    content = taggedContent
    for i in range(len(taggedContent)):
        isVerb = (taggedContent[i] == "VBD" or taggedContent[i] == "VBZ")
        if isVerb:
            correctVerb = tokens[i]
            tokens[i] = "<select id=\"clozefox_answer\">"
            tokens[i] += "<option value=\"wrongAnswer\">loves</option>"
            tokens[i] += "<option value=\"wrongAnswer\">hates</option>"
            tokens[i] += "<option value=\"trueAnswer\">" + correctVerb + "</option>"
            tokens[i] += "</select>"
    content = ' '.join(tokens)
    self.response.headers['Content-Type'] = 'text/html'
    self.response.out.write(content)
def script(input, output):
    abbrevs = load_wordlist('/u/cs401/Wordlists/abbrev.english')
    male_names = load_wordlist('/u/cs401/Wordlists/maleFirstNames.txt')
    female_names = load_wordlist('/u/cs401/Wordlists/femaleFirstNames.txt')
    last_names = load_wordlist('/u/cs401/Wordlists/lastNames.txt')
    pn_abbrevs = load_wordlist('/u/cs401/Wordlists/pn_abbrev.english')
    names = male_names + female_names + last_names
    tagger = NLPlib.NLPlib()
    outfile = open(output, 'w')
    with open(input, 'rU') as file:
        for line in file:
            out_lines = parse(line, abbrevs, pn_abbrevs, names, tagger)
            for l in out_lines:
                outfile.write(l + '\n')
            outfile.write('|\n')
    outfile.close()
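
# load_wordlist is not defined in this snippet. A minimal sketch under the
# assumption that each wordlist file holds one entry per line (the body is
# an assumption, not the original implementation):
def load_wordlist(path):
    with open(path) as f:
        return [line.strip() for line in f if line.strip()]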
def tag(tweet):
    '''Returns a string where each token is tagged using NLPlib in the form of:
    Meet/VB me/PRP today/NN at/IN the/DT FEC/NN in/IN DC/NN at/IN 4/NN ./.
    '''
    tagger = nlp.NLPlib()
    sentences = tweet.rstrip().split('\n')
    processed = ''
    for i in range(len(sentences)):  # go through each sentence in a tweet
        sent = sentences[i].strip().split(' ')
        tags = tagger.tag(sent)
        tagged = []
        for j in range(len(tags)):  # tag each token in the sentence
            tagged.append(sent[j] + '/' + tags[j])
        processed += ' '.join(tagged) + '\n'  # join into a processed tweet
    return '|\n' + processed.rstrip() + '\n'
def main(argv):
    ''' Main method, responsible for parsing system args '''
    use_training = False
    if len(argv) == 3:
        use_training = True
    raw_file = argv[0]
    result_file = argv[-1]
    result_output = open(result_file, 'w')
    if use_training:
        gid = int(argv[1])
        class_one_start = gid * 5500
        class_one_end = (gid + 1) * 5500 - 1
        class_four_start = class_one_start + 800000
        class_four_end = class_one_end + 800000
    # Load resources
    abbrev = load_helper("/u/cs401/Wordlists/abbrev.english")
    pn_abbrev = load_helper("/u/cs401/Wordlists/pn_abbrev.english")
    male_names = load_helper("/u/cs401/Wordlists/maleFirstNames.txt")
    female_names = load_helper("/u/cs401/Wordlists/femaleFirstNames.txt")
    last_names = load_helper("/u/cs401/Wordlists/lastNames.txt")
    names = male_names + female_names + last_names
    # Load tagger
    tagger = NLPlib.NLPlib()
    with open(raw_file, 'rU') as file:
        for i, line in enumerate(file):
            class_label = remove_double_quotes(line.split(",")[0])
            line = remove_double_quotes(line.split(",")[-1])
            if use_training:
                if (class_one_start <= i < class_one_end) or \
                        (class_four_start <= i < class_four_end):
                    lines = parse_line(line, abbrev, pn_abbrev, names, tagger)
                    result_output.write("<A=%s>\n" % class_label)
                    for l in lines:
                        result_output.write(l + "\n")
            else:
                lines = parse_line(line, abbrev, pn_abbrev, names, tagger)
                result_output.write("<A=%s>\n" % class_label)
                for l in lines:
                    result_output.write(l + "\n")
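
# Neither load_helper nor remove_double_quotes appears in this snippet.
# Minimal sketches, assuming load_helper reads one wordlist entry per line
# and remove_double_quotes strips the quotes around a CSV field (both
# bodies are assumptions, not the original implementations):
def load_helper(path):
    with open(path) as f:
        return [line.strip() for line in f if line.strip()]

def remove_double_quotes(field):
    return field.strip().strip('"')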
def __init__(self, tweetArchive, outputFile):
    """ Constructor; takes the csv and the name of the text file to be
    written to. """
    self.textDump = outputFile
    openArchive = open(tweetArchive)
    csvDump = []
    for row in openArchive:
        csvDump.append(row)
    openArchive.close()
    self.tweets = []
    self.tagger = NLPlib.NLPlib()
    for i in csvDump:
        # The class label digit sits at index 1, just inside the leading
        # quoted field of the raw csv line.
        sentiment = i[1]
        breakdown = i
        # Skip past the first five comma-separated fields to reach the
        # tweet text, then drop the surrounding quote characters.
        for j in range(5):
            breakdown = breakdown[breakdown.index(',') + 1:]
        breakdown = breakdown[1:-1]
        self.tweets.append([sentiment, breakdown])
def twtt(input_file_name, output_file_name, SID):
    X = SID % 80
    line_index_list = []
    row_count = sum(1 for row in csv.reader(open(input_file_name)))
    tagger = NLPlib.NLPlib()
    if (row_count >= 1600000):
        line_index_list.append([X * 10000, (X + 1) * 10000])
        line_index_list.append([800000 + X * 10000, 800000 + (X + 1) * 10000])
    else:
        line_index_list.append([0, 20000])
    with open(output_file_name, "w") as output_file:
        for line_index in line_index_list:
            # islice positions are relative to the iterator's current state,
            # so a fresh reader is needed for each absolute row range.
            with open(input_file_name, 'rb') as csvfile:
                spamreader = csv.reader(csvfile)
                for row in itertools.islice(spamreader, line_index[0],
                                            line_index[1]):
                    tweet = row[5]
                    class_type = int(row[0])
                    new_line = twtt1(tweet)
                    new_line = twtt2(new_line)
                    new_line = twtt3(new_line)
                    new_line = twtt4(new_line)
                    sentences = twtt5(new_line)
                    if len(sentences) > 0:
                        sentences = [twtt7(sentence) for sentence in sentences]
                        sentences = [twtt8(sentence, tagger)
                                     for sentence in sentences]
                        sentences = twtt9(sentences, class_type)
                        for sentence in sentences:
                            output_file.write(sentence + '\n')
def main():
    if len(sys.argv) != 4:
        print "This program accepts 3 arguments."
        sys.exit(1)
    csv_file = sys.argv[1]
    studentID = int(sys.argv[2])
    output_file = sys.argv[3]
    with open(csv_file, 'r') as csvfile:
        raw_data = list(csv.reader(csvfile))
    if len(raw_data) > 10000:
        student_module = studentID % 80
        data = raw_data[student_module * 10000:(student_module + 1) * 10000]
        data.extend(raw_data[800000 + student_module * 10000:
                             800000 + (student_module + 1) * 10000])
    else:
        data = raw_data
    print len(data)
    tagger = NLPlib.NLPlib()
    output = open(output_file, 'w')
    for row in data:
        output.write('<A=' + twtt9(row) + '>\n')
        output.write(twtt8(twtt7(twtt5(twtt4(twtt3(twtt2(twtt1(row[5])))))),
                           tagger))
#!/usr/bin/python
import os
import sys
import re
import NLPlib

abbr = {}
tagger = NLPlib.NLPlib()


def twtt1(tw):
    # Remove html tags and attributes.
    tw = re.sub(r'<[^>]+>', '', tw)
    return tw


def twtt2(tw):
    # Replace html character entities with their ASCII equivalents.
    tw = tw.replace("&amp;", '&').replace("&lt;", '<').replace("&gt;", '>')\
        .replace("&quot;", '"').replace("&#39;", "'")
    return tw


def twtt3(tw):
    # Remove URLs.
    tw = re.sub(r'([^\w\d])(http://|https://|www\.)[^\s\"]+', r'\1', tw)
    return tw


def twtt4(tw):
    # Strip the leading @ or # from usernames and hashtags.
    tw = re.sub(r'[@#]([\w\d]+)', r'\1', tw)
    return tw
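
# A short demo of the twtt1-twtt4 pipeline on a made-up tweet (the sample
# input and expected output below are illustrative, not from the original
# code):
if __name__ == '__main__':
    sample = '<b>Check</b> this out &amp; follow @user: http://t.co/abc #cool'
    print twtt4(twtt3(twtt2(twtt1(sample))))
    # -> Check this out & follow user: cool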
def main(argv):
    infile = ''
    outfile = ''
    if len(argv) != 3:
        sys.exit('Usage: run twtt.py <input file> <output file>')
    else:
        infile = argv[1]
        outfile = argv[2]

    # Get file information
    text = file_op(infile, "r")

    # Get rid of html tags + attributes
    text = re.sub(r'<[^>]*>', '', text)

    # Replace html character codes with ASCII equivalent
    text = re.sub(r'(&#)([\d]+);', html_num_repl, text)
    text = re.sub(r'&[\w]+;', html_type_repl, text)

    # http://code.tutsplus.com/tutorials/8-regular-expressions-you-should-know--net-6149
    # All URLs are removed
    text = re.sub(r'\b(https?:\/\/)?([\da-z\.-]+)\.([a-z]{2,3})([/\w\.-]*)',
                  '', text, flags=re.I)

    # First character in usernames and hashtags removed
    text = re.sub(r'(@|#)([^\s]+)', user_hash_repl, text)

    # Separate tweets with a pipe symbol
    text = re.sub(r'\n+', '\n|\n', text)

    # Each sentence within a tweet is on its own line.
    # Ending punctuation is padded by space.
    # Note: Even if ending punctuation is followed by lower case, treat it as
    # a sentence - tweets are often not grammatically correct.
    text = re.sub(r'([\.\!\?]+)(?![a-zA-Z].)([\'\"]?)(\s*)', add_newline, text)

    # Split on colons unless followed by digits (time)
    text = re.sub(r'(:+)(?!\d)([\'\"]?)(\s*)', add_newline, text)

    # Separate normal punctuation by spaces.
    # Exclude the period because it was already separated earlier (ending
    # punctuation). Dashes need extra space so we won't split on hyphens.
    text = re.sub(r'(\s*)(,+|\!+|\?+|;+|\"+|\(+|\)+|\$+'
                  r'|\#+| -+|-+ )(\s*)', sep_punc, text)

    # Split on colons unless followed by digits (time)
    text = re.sub(r'(\s*)(:+)(?!\d)(\s*)', sep_punc, text)

    # Make it so abbreviations aren't on new lines.
    # Also gets rid of spaces between abbreviation and period.
    abvs = file_op('/u/cs401/Wordlists/pn_abbrev.english', "r")
    abvs = abvs + '\n' + file_op('/u/cs401/Wordlists/abbrev.english', "r")
    abvs = re.split(r'.\n+', abvs)
    text = re.sub(r'\b(' + '|'.join(abvs) + r'\b)(\s*)(.)(\s*)(\n)',
                  undo_abv, text, flags=re.I)

    # Separate possessive apostrophe of plural
    text = re.sub(r'(\s*)(\')(\s+)', sep_punc, text)

    # Separate n't clitics
    text = re.sub(r'(\w+)(n\'t)', sep_clitic, text)

    # Separate other clitics
    text = re.sub(r'(\w*[^\WNn])(\'\w+)', sep_clitic, text)

    # Tag tokens
    tagger = NLPlib.NLPlib()
    str_list = re.split(r'[ \t\r\f\v]+', text)
    tag_list = tagger.tag(str_list)
    text = ''

    # Combine tokens with tags
    for val, tag in itertools.izip(str_list, tag_list):
        if val:
            text = text + val + '/' + tag + ' '

    # Output the info to a file
    file_op(outfile, "w", text)
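
# file_op is not shown here. A plausible minimal sketch, inferred from the
# calls above: read the whole file when mode is "r", otherwise write `data`.
# The name and contract come from the calls; the body is an assumption.
def file_op(fname, mode, data=None):
    f = open(fname, mode)
    if mode == "r":
        result = f.read()
        f.close()
        return result
    f.write(data)
    f.close()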
def load_abbreviations():
    abrv_1 = open("./Wordlists/abbrev.english", "r").read().split('\n')
    abrv_2 = open("./Wordlists/pn_abbrev.english", "r").read().split('\n')
    abrv_1.pop()
    abrv_2.pop()
    return map(lambda x: x.lower(), abrv_1 + abrv_2)
    # TODO add U.S. to abbreviations


# print abbreviations
if __name__ == "__main__":
    input_fpntr = open(sys.argv[1], "r")
    output_fpntr = open(sys.argv[2], "w")
    clitics = ["'m", "'re", "'s", "'ll", "'ve", "n't"]
    abbreviations = load_abbreviations()
    tagger = NLPlib()
    for line in input_fpntr:
        #output_fpntr.write('ORG:' + line + '\n')  # TODO: delete
        # substitute ... (ellipsis) with … to avoid multiple periods
        #line = re.sub("\.[ \t]?\.[ \t]?\.[ \t]?(?:\.[ \t]?)*", " … ", line)
        # substitute multiple dashes with an em dash
        #line = re.sub("--", " — ", line)
        #output_fpntr.write(line+"\n")
        line = remove_html_url(line)
        #output_fpntr.write(line+"\n")
        line = remove_html_special_char(line)
        #output_fpntr.write(line+"\n")
        line = separate_sentences(line)
        #output_fpntr.write(line)
# This file parses the raw csv data into normalized form.
import NLPlib
import sys
import csv
import re
import codecs
import HTMLParser
import io

# WARNING: Change this before submitting.
ABBR_FILE = '/u/cs401/Wordlists/abbrev.english'
TAGGER = NLPlib.NLPlib()
CLASSES = [0, 4]
CLASS_INDICES = {0: 0, 4: 800000}
NUM_TRAIN = 11000

with open(ABBR_FILE) as f:
    ABBREVIATIONS = map(lambda x: x.rstrip('\n'), f.readlines())


def strip_html(tweet):  # Step 1 of part 1.
    """ Return the input tweet with all html tags and attributes removed.

    input: tweet - a string representing a tweet.
    output: tweet - a string representing a tweet.
    """
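    # The body is not included in this snippet. A one-line sketch that
    # matches the docstring (and mirrors twtt1 elsewhere in this
    # collection); an assumption, not the original implementation:
    return re.sub(r'<[^>]+>', '', tweet)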
def main(args):
    # For class 0, use lines [GID x 5500 ... (GID+1) x 5500 - 1]
    # For class 4, use lines 800,000 + [GID x 5500 ... (GID+1) x 5500 - 1]
    # my group_id = 100
    LINES_BTW_CLASS = 800000
    c0start = -1
    c0end = -1
    c4start = -1
    c4end = -1
    is_group_exist = False
    print len(args)

    ## Argument checking
    if (len(args) == 4):
        input_filename = args[1]
        output_filename = args[3]
        try:
            group_id = int(args[2])
            c0start = (group_id * 5500)
            c0end = ((group_id + 1) * 5500) - 1
            c4start = LINES_BTW_CLASS + c0start
            c4end = LINES_BTW_CLASS + c0end
            is_group_exist = True
        except ValueError:
            print "Parameter (%s) is not numeric" % args[2]
    elif (len(args) == 3):
        # Input and output variables must be strings.
        input_filename = args[1]
        output_filename = args[2]
        group_id = -1
    else:
        print "Wrong number of arguments"
        print "Usage: python twtt.py <input_filename> <group_number> <output_filename>"
        sys.exit()

    print 'Number of arguments:', len(args), 'arguments.'
    print 'Input csv filename: ', input_filename, len(input_filename)
    if (group_id != -1):
        print 'Group ID: ', group_id
    print 'Output filename: ', output_filename

    ####
    # Read CSV file and write the preprocessing results
    ####
    tagger = NLPlib.NLPlib()  # init tagger
    wfp = open(output_filename, "w")  # file pointer for the output file
    count = 0
    with open(input_filename, 'r+') as f:
        reader = csv.reader(f)
        if (group_id != -1):  # group id is provided
            try:
                for i, row in enumerate(reader):
                    if (i >= c0start and i <= c0end) or \
                            (i >= c4start and i <= c4end):
                        count = count + 1
                        tweet = Tweet(row)
                        tweet.do_preprocess()
                        tweet.tagging(tagger)
                        result = tweet.printable_tweet()
                        print result
                        wfp.write(result + "\n")
            except csv.Error as e:
                sys.exit(" file %s, line %d: %s"
                         % (input_filename, reader.line_num, e))
        else:  # group_id is not provided, use all data
            try:
                for i, row in enumerate(reader):
                    tweet = Tweet(row)
                    tweet.do_preprocess()
                    tweet.tagging(tagger)
                    result = tweet.printable_tweet()
                    print result
                    wfp.write(result + "\n")
            except csv.Error as e:
                sys.exit(" file %s, line %d: %s"
                         % (input_filename, reader.line_num, e))
    print "Count is %s" % count
    wfp.close()
__author__ = 'Shaham'

import re
import random

import nltk
from nltk.corpus import wordnet
from transcript import *
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import train_test_split
from sklearn import neighbors
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from NLPlib import *

nlp_tag = NLPlib()

import matplotlib.pyplot as plt
import word2vec
import gensim

# Build one alternation regex from the laughs wordlist, anchoring each
# entry at the end of a token: e.g. "haha$|lol$|...".
patternL = []
with open('Wordlists/laughs.txt', 'rb') as f:
    for word in f:
        if len(word) > 1:
            w = word[:-2]  # strip the trailing \r\n
            patternL.append(w)
            patternL.append('$')
            patternL.append('|')
pattern = "".join(patternL[:-1])  # drop the final '|'
#print pattern
laughs = re.compile(pattern, re.IGNORECASE)

patternL = []
with open('Wordlists/Slang2', 'rb') as f: