Example #1
    def get(self):
        # tokenList = word_tokenize("John's big idea isn't all that bad.")
        # tokenList = pos_tag(word_tokenize("John's big idea isn't all that bad.")) 

        stemmer = PorterStemmer()
        plurals = ['caresses', 'flies', 'dies', 'mules', 'denied',
                   'died', 'agreed', 'owned', 'humbled', 'sized',
                   'meeting', 'stating', 'siezing', 'itemization',
                   'sensational', 'traditional', 'reference', 'colonizer',
                   'plotted']
        # stem each word; 'singles' holds the stemmed forms
        singles = []
        for plural in plurals:
            singles.append(stemmer.stem(plural))


        self.response.headers['Content-Type'] = 'text/html'
        self.response.out.write('Hello test!')
        self.response.out.write(str(singles))


        nlProcessor = NLPlib()

        s = "Very little is known about Beethoven's childhood. He was baptized  on December 17, 1770 and was probably born a few days before that. [1][4][5][6]  Beethoven's parents were Johann van Beethoven (1740 in Bonn - December 18, 1792) and Maria Magdalena Keverich (1744 in Ehrenbreitstein - July 17, 1787)."

        v = nlProcessor.tokenize(s)
        t = nlProcessor.tag(v)
        for i in range(len(v)):
            self.response.out.write(v[i] + "(" + t[i] + ")<br/>")
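For reference, the stemming demo above produces classic Porter stems; a minimal sketch of a few of them, assuming NLTK's PorterStemmer (exact stems can vary slightly across NLTK versions):

from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()
for word in ['caresses', 'flies', 'denied', 'meeting', 'itemization']:
    print word, '->', stemmer.stem(word)
# caresses -> caress
# flies -> fli
# denied -> deni
# meeting -> meet
# itemization -> item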
Example #2
def split_sentences(input_fname, output_fname):
    '''
    Read input_fname line by line, tag each sentence with NLPlib, and write
    the "token/TAG" sentences to output_fname, delimiting tweets with "|".
    '''
    tagger = NLPlib.NLPlib()
    input_f = open(input_fname, "r")
    output_f = open(output_fname, "w+")
    output_f.write("|\n")
    for line in input_f:
        line = clean(line)
        sentences = handle_mult_punctuation(line)
        for i in range(len(sentences)):
            sent = sentences[i].split()
            tags = tagger.tag(sent)
            # rewrite each token as "token/TAG"
            for j in range(len(sent)):
                sent[j] += ("/" + tags[j])
            sent_line = " ".join(sent)
            output_f.write(sent_line)
            output_f.write("\n")
        output_f.write("|\n")
    input_f.close()
    output_f.close()
def test():
    tweets = get_file_data(sys.argv[1])
    posts = []
    sys.path.append("/home/nsatvik/twitminer/miner")
    print "1-Sports 2-Politics"
    tagger = NLPlib()
    for t in tweets:
        posts.append(tweet(t, 1))
        print posts[-1].get_text()
        a = input("1 to display tags")
        if a == 1:
            words = tagger.tokenize(posts[-1].get_text())
            tags = tagger.tag(words)
            for i in range(len(words)):
                print words[i], " ", tags[i]
Example #4
    def tag_PoS(self, texts):
        if self.tagger is None:
            self.tagger = NLPlib.NLPlib()
        processed_texts = []
        for sentence in texts:
            tags = self.tagger.tag(sentence)
            processed_texts.append(
                [x + '/' + y for x, y in zip(sentence, tags)])
        return processed_texts
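A minimal usage sketch for tag_PoS; the Preprocessor host class here is hypothetical (invented for illustration), NLPlib is assumed importable as in the other examples, and the exact tags depend on NLPlib's model:

import NLPlib

class Preprocessor(object):
    # hypothetical host class; its constructor leaves the tagger unset
    def __init__(self):
        self.tagger = None

    def tag_PoS(self, texts):
        if self.tagger is None:
            self.tagger = NLPlib.NLPlib()
        processed_texts = []
        for sentence in texts:
            tags = self.tagger.tag(sentence)
            processed_texts.append(
                [x + '/' + y for x, y in zip(sentence, tags)])
        return processed_texts

p = Preprocessor()
print p.tag_PoS([['Meet', 'me', 'today']])
# expected shape: [['Meet/VB', 'me/PRP', 'today/NN']] (tags are illustrative)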
Example #5
    def post(self):

        nlProcessor = NLPlib()
        content = self.request.get('content')
        tokens = nlProcessor.tokenize(content) 
        taggedContent = nlProcessor.tag(tokens)

        for i in range(len(taggedContent)):
            isVerb = (taggedContent[i] == "VBD" or taggedContent[i] == "VBZ") 
            if isVerb:
                correctVerb = tokens[i]
                tokens[i] = "<select id=\"clozefox_answer\">"
                tokens[i] += "<option value=\"wrongAnswer\">loves</option>" 
                tokens[i] += "<option value=\"wrongAnswer\">hates</option>" 
                tokens[i] += "<option  value=\"trueAnswer\">" + correctVerb + "</option>"
                tokens[i] += "</select>"
        
        content = ' '.join(tokens)

        self.response.headers['Content-Type'] = 'text/html'
        self.response.out.write(content)
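For illustration, if the posted content were "She sings well ." and the tagger labeled "sings" as VBZ (an assumption), the handler would emit something like:

She <select id="clozefox_answer"><option value="wrongAnswer">loves</option><option value="wrongAnswer">hates</option><option value="trueAnswer">sings</option></select> well .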
Example #6
def script(input, output):
    abbrevs = load_wordlist('/u/cs401/Wordlists/abbrev.english')
    male_names = load_wordlist('/u/cs401/Wordlists/maleFirstNames.txt')
    female_names = load_wordlist('/u/cs401/Wordlists/femaleFirstNames.txt')
    last_names = load_wordlist('/u/cs401/Wordlists/lastNames.txt')
    pn_abbrevs = load_wordlist('/u/cs401/Wordlists/pn_abbrev.english')
    names = male_names + female_names + last_names
    tagger = NLPlib.NLPlib()
    outfile = open(output, 'w')
    with open(input, 'rU') as infile:
        for line in infile:
            out_lines = parse(line, abbrevs, pn_abbrevs, names, tagger)
            for l in out_lines:
                outfile.write(l + '\n')
            outfile.write('|\n')
    outfile.close()
def tag(tweet):
    '''Returns a string where each token is tagged using NLPlib in the form of:
  Meet/VB me/PRP today/NN at/IN the/DT FEC/NN in/IN DC/NN at/IN 4/NN ./.
  '''
    tagger = nlp.NLPlib()
    sentences = tweet.rstrip().split('\n')
    processed = ''
    for i in range(len(sentences)):  # go through each sentence in a tweet
        sent = sentences[i].strip().split(' ')
        tags = tagger.tag(sent)
        tagged = []
        for j in range(len(tags)):  # tag each token in the sentence
            tagged.append(sent[j] + '/' + tags[j])
        processed += ' '.join(tagged) + '\n'  # join into a processed tweet
    return '|\n' + processed.rstrip() + '\n'
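A quick usage sketch, reusing the docstring's own example (tags depend on NLPlib's model):

print tag('Meet me today at the FEC in DC at 4 .')
# |
# Meet/VB me/PRP today/NN at/IN the/DT FEC/NN in/IN DC/NN at/IN 4/NN ./.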
def main(argv):
  '''
  Main method, responsible for parsing system args
  '''
  use_training = False
  if len(argv) == 3:
    use_training = True

  raw_file = argv[0]
  result_file = argv[-1]
  result_output = open(result_file, 'w')

  if use_training:
    gid = int(argv[1])
    class_one_start = gid * 5500
    class_one_end = (gid + 1) * 5500 - 1
    class_four_start = class_one_start + 800000
    class_four_end = class_one_end + 800000

  # Load resources
  abbrev = load_helper("/u/cs401/Wordlists/abbrev.english")
  pn_abbrev = load_helper("/u/cs401/Wordlists/pn_abbrev.english")
  male_names = load_helper("/u/cs401/Wordlists/maleFirstNames.txt")
  female_names = load_helper("/u/cs401/Wordlists/femaleFirstNames.txt")
  last_names = load_helper("/u/cs401/Wordlists/lastNames.txt")
  names = male_names + female_names + last_names

  # Load tagger
  tagger = NLPlib.NLPlib()

  with open(raw_file, 'rU') as infile:
    for i, line in enumerate(infile):
      class_label = remove_double_quotes(line.split(",")[0])
      line = remove_double_quotes(line.split(",")[-1])

      if use_training:
        if (i < class_one_end and i >= class_one_start) or (i < class_four_end and i >= class_four_start):
          lines = parse_line(line, abbrev, pn_abbrev, names, tagger)
          result_output.write("<A=%s>\n" % class_label)
          for l in lines:
            result_output.write(l + "\n")
      else:
        lines = parse_line(line, abbrev, pn_abbrev, names, tagger)
        result_output.write("<A=%s>\n" % class_label)
        for l in lines:
          result_output.write(l + "\n")
    def __init__(self, tweetArchive, outputFile):
        """
        Constructor; takes the csv and the name of the text file to be written to.
        """
        self.textDump = outputFile
        openArchive = open(tweetArchive)
        csvDump = []
        for row in openArchive:
            csvDump.append(row)
        openArchive.close()
        self.tweets = []
        self.tagger = NLPlib.NLPlib()
        for i in csvDump:
            # rows look like '"0","...",...', so the character at index 1 is
            # the sentiment digit (an assumption about the csv layout)
            sentiment = i[1]
            breakdown = i
            for j in range(5):
                breakdown = breakdown[breakdown.index(',') + 1:]
            breakdown = breakdown[1:-1]
            self.tweets.append([sentiment, breakdown])
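As a side note, the manual comma-skipping above could be done with the csv module instead; a minimal sketch, assuming the sentiment is the first field and the tweet text the last (the filename is illustrative):

import csv

with open('tweets.csv') as f:
    for row in csv.reader(f):
        sentiment, text = row[0], row[-1]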
Example #10
def twtt(input_file_name, output_file_name, SID):

    X = SID % 80
    line_index_list = []
    with open(input_file_name) as f:
        row_count = sum(1 for row in csv.reader(f))

    tagger = NLPlib.NLPlib()
    if (row_count >= 1600000):
        line_index_list.append([X * 10000, (X + 1) * 10000])
        line_index_list.append([800000 + X * 10000, 800000 + (X + 1) * 10000])
    else:
        line_index_list.append([0, 20000])

    with open(output_file_name, "w") as output_file:
        with open(input_file_name, 'rb') as csvfile:

            spamreader = csv.reader(csvfile)

            for line_index in line_index_list:
                for row in itertools.islice(spamreader, line_index[0],
                                            line_index[1]):
                    tweet = row[5]
                    class_type = int(row[0])
                    new_line = twtt1(tweet)
                    new_line = twtt2(new_line)
                    new_line = twtt3(new_line)
                    new_line = twtt4(new_line)
                    sentences = twtt5(new_line)

                    if len(sentences) > 0:
                        sentences = [twtt7(sentence) for sentence in sentences]
                        sentences = [
                            twtt8(sentence, tagger) for sentence in sentences
                        ]

                    sentences = twtt9(sentences, class_type)

                    for sentence in sentences:
                        output_file.write(sentence + '\n')
def main():
    if len(sys.argv) != 4:
        print "This program accepts 3 arguments."
        sys.exit(1)

    csv_file = sys.argv[1]
    studentID = int(sys.argv[2])
    output_file = sys.argv[3]

    with open(csv_file, 'r') as csvfile:
        raw_data = list(csv.reader(csvfile))
        if len(raw_data) > 10000:
            student_module = studentID % 80
            data = raw_data[student_module * 10000: (student_module + 1) * 10000]
            data.extend(raw_data[800000 + student_module * 10000: 800000 + (student_module + 1) * 10000])
        else:
            data = raw_data
        print len(data)
        tagger = NLPlib.NLPlib()
        output = open(output_file, 'w')
        for row in data:
            output.write('<A=' + twtt9(row) + '>\n')
            output.write(twtt8(twtt7(twtt5(twtt4(twtt3(twtt2(twtt1(row[5])))))), tagger))
Example #12
#!/usr/bin/python
import os
import sys
import re
import NLPlib

abbr = {}
tagger = NLPlib.NLPlib()


def twtt1(tw):
    # remove html tags and attributes
    tw = re.sub(r'<[^>]+>', '', tw)
    return tw


def twtt2(tw):
    # replace common html character entities with their ascii equivalents
    tw = tw.replace("&amp;",'&').replace("&lt;",'<').replace("&gt;",'>')\
       .replace("&quot;",'"').replace("&#39;","'")
    return tw


def twtt3(tw):
    # remove urls, keeping the non-word character that precedes them
    tw = re.sub(r'([^\w\d])(http://|https://|www\.)[^\s\"]+', r'\1', tw)
    return tw


def twtt4(tw):
    # strip the leading @ or # from usernames and hashtags
    tw = re.sub(r'[@#]([\w\d]+)', r'\1', tw)
    return tw

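Illustrative inputs and outputs for the twtt helpers above (the sample strings are invented, not from the original source):

print twtt1('<b>hello</b> world')          # hello world
print twtt2('a &amp; b &lt; c')            # a & b < c
print twtt3(' see http://example.com/x')   # ' see ' (URL dropped, separator kept)
print twtt4('@user loves #python')         # user loves python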
Example #13
import itertools
import re
import sys

import NLPlib


def main(argv):
    infile = ''
    outfile = ''

    if len(argv) != 3:
        sys.exit('Usage: run twtt.py <input file> <output file>')
    else:
        infile = argv[1]
        outfile = argv[2]

    # Get file information
    text = file_op(infile, "r")

    # Get rid of html tags + attributes
    text = re.sub(r'<[^>]*>', '', text)

    # Replace html character codes with ASCII equivalent
    text = re.sub(r'(&#)([\d]+);', html_num_repl, text)
    text = re.sub(r'&[\w]+;', html_type_repl, text)

    # http://code.tutsplus.com/tutorials/8-regular-expressions-you-should-know--net-6149
    # All URLs are removed
    text = re.sub(r'\b(https?:\/\/)?([\da-z\.-]+)\.([a-z]{2,3})([/\w\.-]*)',
                  '',
                  text,
                  flags=re.I)

    # First character in usernames and hashtags removed
    text = re.sub(r'(@|#)([^\s]+)', user_hash_repl, text)

    # Separate tweets with a pipe symbol
    text = re.sub(r'\n+', '\n|\n', text)

    # Each sentence within a tweet is on its own line
    # Ending punctuation is padded by space
    # Note: Even if ending punctuation is followed by lower case, treat it as
    #   a sentence - tweets are often not grammatically correct
    text = re.sub(r'([\.\!\?]+)(?![a-zA-Z].)([\'\"]?)(\s*)', add_newline, text)
    # Split on colons unless followed by digits (time)
    text = re.sub(r'(:+)(?!\d)([\'\"]?)(\s*)', add_newline, text)

    # Separate normal punctuation by spaces
    # Exclude the period because already separated earlier (ending punctuation)
    # Dashes need extra space so we won't split on hyphens
    text = re.sub(r'(\s*)(,+|\!+|\?+|;+|\"+|\(+|\)+|\$+'\
                  '|\#+| -+|-+ )(\s*)', sep_punc, text)
    # Split on colons unless followed by digits (time)
    text = re.sub(r'(\s*)(:+)(?!\d)(\s*)', sep_punc, text)

    # Make it so abbreviations aren't on new lines
    # Also gets rid of spaces between abbreviation and period
    abvs = file_op('/u/cs401/Wordlists/pn_abbrev.english', "r")
    abvs = abvs + '\n' + file_op('/u/cs401/Wordlists/abbrev.english', "r")
    abvs = re.split(r'.\n+', abvs)
    text = re.sub(r'\b(' + '|'.join(abvs) + '\b)(\s*)(.)(\s*)(\n)',
                  undo_abv,
                  text,
                  flags=re.I)

    # Separate possessive apostrophe of plural
    text = re.sub(r'(\s*)(\')(\s+)', sep_punc, text)

    # Separate n't clitics
    text = re.sub(r'(\w+)(n\'t)', sep_clitic, text)

    # Separate other clitics
    text = re.sub(r'(\w*[^\WNn])(\'\w+)', sep_clitic, text)

    # Tag tokens
    tagger = NLPlib.NLPlib()
    str_list = re.split(r'[ \t\r\f\v]+', text)
    tag_list = tagger.tag(str_list)

    text = ''
    # Combine tokens with tags
    for val, tag in itertools.izip(str_list, tag_list):
        if val:
            text = text + val + '/' + tag + ' '

    # Output the info to a file
    file_op(outfile, "w", text)
Example #14
def load_abbreviations():
    abrv_1 = open("./Wordlists/abbrev.english", "r").read().split('\n')
    abrv_2 = open("./Wordlists/pn_abbrev.english", "r").read().split('\n')
    abrv_1.pop()
    abrv_2.pop()
    return map(lambda x: x.lower(), abrv_1 + abrv_2)
    # TODO: add U.S. to abbreviations

if __name__ == "__main__":
    input_fpntr = open(sys.argv[1], "r")
    output_fpntr = open(sys.argv[2], "w")
    clitics = ["'m", "'re", "'s", "'ll", "'ve", "n't"]
    abbreviations =  load_abbreviations()
    tagger = NLPlib()
    
    for line in input_fpntr:
        # substitute ... (ellipsis) with &hellip; to avoid multiple periods
        # line = re.sub("\.[ \t]?\.[ \t]?\.[ \t]?(?:\.[ \t]?)*", " &hellip; ", line)
        # substitute an em dash for multiple dashes
        # line = re.sub("--", " &mdash; ", line)
        line = remove_html_url(line)
        line = remove_html_special_char(line)
        line = separate_sentences(line)
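A hedged sketch of how the clitics list above is typically applied; the function name and regex are illustrative, not from the original source:

import re

def split_clitics(token):
    # separate a trailing clitic such as n't or 's into its own token
    m = re.match(r"(\w+)(n't|'m|'re|'s|'ll|'ve)$", token)
    return m.group(1) + ' ' + m.group(2) if m else token

print split_clitics("isn't")   # is n't
print split_clitics("John's")  # John 's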
Example #15
# This file parses the raw csv data into normalized form.

import NLPlib
import sys
import csv
import re
import codecs
import HTMLParser
import io

# WARNING: Change this before submitting.
ABBR_FILE = '/u/cs401/Wordlists/abbrev.english'
TAGGER = NLPlib.NLPlib()

CLASSES = [0, 4]
CLASS_INDICES = {0: 0, 4: 800000}
NUM_TRAIN = 11000

with open(ABBR_FILE) as f:
    ABBREVIATIONS = map(lambda x: x.rstrip('\n'), f.readlines())

def strip_html(tweet):  # Step 1 of part 1.
    """
    Return the input tweet with all html tags and attributes
    removed.

    input:
        tweet - a string representing a tweet.
    output:
        tweet - a string representing a tweet.
    """
    # The body is truncated in this listing; a minimal version, mirroring
    # twtt1 in Example #12, would be:
    return re.sub(r'<[^>]+>', '', tweet)
Example #16
def main(args):
    # For class 0, use lines [GID x 5500 ... (GID+1) x 5500 -1 ]
    # For class 4, use liens 800,000 + [GID x 5500 ... (GID+1) x 5500 -1]
    # my group_id = 100
    LINES_BTW_CLASS = 800000
    c0start = -1
    c0end = -1
    c4start = -1
    c4end = -1
    is_group_exist = False
    print len(args)
    ## arguments checking
    if (len(args) == 4):
        input_filename = args[1]
        output_filename = args[3]
        try:
            group_id = int(args[2])
            c0start = (group_id * 5500)
            c0end = ((group_id + 1) * 5500) - 1
            c4start = LINES_BTW_CLASS + c0start
            c4end = LINES_BTW_CLASS + c0end
            is_group_exist = True
        except ValueError:
            print "Parameter (%s) is not a numeric" % args[2]
    elif (len(args) == 3):
        # variables must be strings: the input and output filenames
        input_filename = args[1]
        output_filename = args[2]
        group_id = -1
    else:
        print "Wrong number of arguments"
        print "Usage: python twtt.py <input_filename> <group_number> <output_filename>"
        sys.exit()

    print 'Number of arguments:', len(args), 'arguments.'
    print 'Input csv filename: ', input_filename, len(input_filename)
    if (group_id != -1):
        print 'Group ID: ', group_id
    print 'Output filename: ', output_filename

    ####
    # Read CSV file and Write the preprocessing results
    ####
    tagger = NLPlib.NLPlib()  # init tagger
    wfp = open(output_filename,
               "w")  # file pointer for writing result into outputfile
    count = 0
    with open(input_filename, 'r+') as f:
        reader = csv.reader(f)
        if (group_id != -1):  # group id is provided
            try:
                for i, row in enumerate(reader):
                    if (c0start <= i <= c0end) or (c4start <= i <= c4end):
                        count = count + 1
                        tweet = Tweet(row)
                        tweet.do_preprocess()
                        tweet.tagging(tagger)
                        result = tweet.printable_tweet()
                        print result
                        wfp.write(result + "\n")

            except csv.Error as e:
                sys.exit(" file %s, line %d: %s" %
                         (input_filename, reader.line_num, e))
        else:  # group_id is not provided, use all data
            try:
                for i, row in enumerate(reader):
                    tweet = Tweet(row)
                    tweet.do_preprocess()
                    tweet.tagging(tagger)
                    result = tweet.printable_tweet()
                    print result
                    wfp.write(result + "\n")
            except csv.Error as e:
                sys.exit(" file %s, line %d: %s" %
                         (input_filename, reader.line_num, e))
    print "Count is %s" % count
    wfp.close()
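Concretely, for the group_id of 100 mentioned in the comment at the top of main, c0start..c0end come out to lines 550000..555499 for class 0, and c4start..c4end to lines 1350000..1355499 for class 4.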
Example #17
__author__ = 'Shaham'

import random
import re

import gensim
import matplotlib.pyplot as plt
import nltk
import word2vec
from nltk.corpus import wordnet
from sklearn import neighbors
from sklearn import svm
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB

from transcript import *
from NLPlib import *

nlp_tag = NLPlib()

# build a regex matching any laugh word anchored at end of string,
# e.g. "haha$|lol$|..."
patternL = []
with open('Wordlists/laughs.txt', 'rb') as f:
    for word in f:
        if len(word) > 1:
            w = word[:-2]  # strip the trailing "\r\n"
            patternL.append(w)
            patternL.append('$')
            patternL.append('|')
pattern = "".join(patternL[:-1])
laughs = re.compile(pattern, re.IGNORECASE)

patternL = []
with open('Wordlists/Slang2', 'rb') as f: