def RunPOSTagger(self):
    twitter_messages = self.tapi.GetPublicTimeline()
    for message in twitter_messages:
        try:
            # run TreeTagger over the tweet text and capture its output in a temp file
            cmd = 'echo "' + message.text + '" | treetagger/cmd/tree-tagger-english > ./twitter_message_output.txt'
            os.system(cmd)
            self.pos_file = open('twitter_message_output.txt', 'r')
            tokens = []
            self.parse_string = ""
            for line in self.pos_file:
                current_line = []
                self.parse_string += line + "<BR>"
                for value in tokenize.whitespace(line):
                    current_line.append(value)
                tokens.append(current_line)
            # write the tagged tweet into a uniquely named HTML page
            filename = uuid.uuid4()
            self.output_file = open(str(filename) + ".html", 'w')
            self.output_file.write(file_header % (message.text))
            self.output_file.write(message.text + "<BR>")
            self.RetreiveFlickrURLs(tokens)
            self.output_file.write(file_footer)
            self.output_file.close()
            # upload the finished page over FTP
            self.output_file = open(str(filename) + ".html", 'r')
            self.ftp_socket.storlines("STOR " + str(filename) + ".html", self.output_file)
            self.output_file.close()
            self.pos_file.close()
            time.sleep(30)
        except UnicodeEncodeError:
            print "Twitter Message not ascii, skipping"
        except AttributeError:
            print "Weird XML error. I wish it'd stop doing that"

def processWhitespacesWithoutStopWords(self, corpus, caseSensitive):
    from nltk import tokenize
    # initialise token list
    tokens = []
    # get tokens separated by whitespaces
    tokenizedCorpus = tokenize.whitespace(corpus)
    # compile regular expression for matching whitespaces
    whitespaces = re.compile(r'\s|\ \;')
    # go through each token in corpus
    for token in tokenizedCorpus:
        # if case-sensitive handling of tokens
        if caseSensitive == 1:
            pass
        else:
            token = token.lower()
        # remove white spaces at beginning
        token = whitespaces.sub('', token)
        # append token to list
        tokens.append(token)
    # return tokens
    return tokens

def processWhitespaces(self, corpus, stopWordList, caseSensitive,
                       minimumTokenLength=3, maximumTokenLength=25):
    from nltk import tokenize
    # initialise token list
    tokens = []
    # initialise token buffer
    tokenBuffer = ''
    # get tokens separated by whitespaces
    tokenizedCorpus = tokenize.whitespace(corpus)
    # compile regular expression for matching special characters
    specialCharacters = re.compile(r'\&.+\;')
    # compile regular expression for matching whitespaces
    whitespaces = re.compile(r'\s|\ \;')
    # compile regular expression for sentence-boundary matching
    sentenceBoundary = re.compile(r'[\.\:\!\?\,]')
    # go through each token in corpus
    for token in tokenizedCorpus:
        # get token length
        tokenLength = len(token)
        # see if token contains special characters
        specialCharacterMatches = specialCharacters.findall(token)
        # reduce special characters to size one
        if specialCharacterMatches:
            for match in specialCharacterMatches:
                tokenLength -= (len(match) - 1)
        # if case-sensitive handling of tokens
        if caseSensitive == 1:
            pass
        else:
            token = token.lower()
        # remove white spaces at beginning and end
        token = whitespaces.sub('', token)
        # write token to buffer and remove punctuation
        tokenBuffer = sentenceBoundary.sub('', token)
        # mark stop words
        if (tokenLength < minimumTokenLength
                or tokenLength > maximumTokenLength
                or tokenBuffer in stopWordList
                or tokenBuffer.lower() in stopWordList):
            tokens.append(token + '<STOPWORD>')
        else:
            tokens.append(token)
    # return tokens
    return tokens

def GetTagList(self, text):
    self._before = text
    # collect the whitespace-separated tokens from the text
    tags = list(tokenize.whitespace(text))
    self._output = ""
    self._after = tags
    return tags

def demo():
    import sys, time

    S = GrammarCategory.parse('S')
    VP = GrammarCategory.parse('VP')
    NP = GrammarCategory.parse('NP')
    PP = GrammarCategory.parse('PP')
    V = GrammarCategory.parse('V')
    N = GrammarCategory.parse('N')
    P = GrammarCategory.parse('P')
    Name = GrammarCategory.parse('Name')
    Det = GrammarCategory.parse('Det')
    DetSg = GrammarCategory.parse('Det[-pl]')
    DetPl = GrammarCategory.parse('Det[+pl]')
    NSg = GrammarCategory.parse('N[-pl]')
    NPl = GrammarCategory.parse('N[+pl]')

    # Define some grammatical productions.
    grammatical_productions = [
        cfg.Production(S, (NP, VP)),
        cfg.Production(PP, (P, NP)),
        cfg.Production(NP, (NP, PP)),
        cfg.Production(VP, (VP, PP)),
        cfg.Production(VP, (V, NP)),
        cfg.Production(VP, (V,)),
        cfg.Production(NP, (DetPl, NPl)),
        cfg.Production(NP, (DetSg, NSg))]

    # Define some lexical productions.
    lexical_productions = [
        cfg.Production(NP, ('John',)),
        cfg.Production(NP, ('I',)),
        cfg.Production(Det, ('the',)),
        cfg.Production(Det, ('my',)),
        cfg.Production(Det, ('a',)),
        cfg.Production(NSg, ('dog',)),
        cfg.Production(NSg, ('cookie',)),
        cfg.Production(V, ('ate',)),
        cfg.Production(V, ('saw',)),
        cfg.Production(P, ('with',)),
        cfg.Production(P, ('under',)),
        ]

    earley_grammar = cfg.Grammar(S, grammatical_productions)

    earley_lexicon = {}
    for prod in lexical_productions:
        earley_lexicon.setdefault(prod.rhs()[0].upper(), []).append(prod.lhs())

    def lexicon(word):
        return earley_lexicon.get(word.upper(), [])

    sent = 'I saw John with a dog with my cookie'
    print "Sentence:\n", sent
    from nltk import tokenize
    tokens = list(tokenize.whitespace(sent))
    t = time.time()

    cp = FeatureEarleyChartParse(earley_grammar, lexicon, trace=1)
    trees = cp.get_parse_list(tokens)
    print "Time: %s" % (time.time() - t)
    for tree in trees:
        print tree

def _Run(self, text):
    self._output = ""
    # run TreeTagger over the text and capture its output in a temp file
    cmd = "echo \"%s\" | treetagger/cmd/tree-tagger-english > ./twitter_message_output.txt" % text
    os.system(cmd)
    pos_file = open('twitter_message_output.txt', 'r')
    tokens = []
    self.parse_string = ""
    for line in pos_file:
        # split each TreeTagger output line into its whitespace-separated fields
        current_line = []
        self._output += line
        for value in tokenize.whitespace(line):
            current_line.append(value)
        tokens.append(current_line)
    self._output = self._output.replace("<unknown>", "[unknown]")
    # keep only the lines the comparison function accepts, then return their first fields
    filtered_tags = filter(self._ComparisonFunction, tokens)
    final_tags = [i[0] for i in filtered_tags]
    return final_tags

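Both _Run above and RunPOSTagger earlier splice raw text directly into a shell pipeline. Below is a minimal sketch of the same invocation with the input shell-escaped first via the Python 2 standard library's pipes.quote; the helper name is hypothetical and not part of the original classes.

import os
import pipes

def run_treetagger_escaped(text):
    # hypothetical helper: same TreeTagger pipeline as above, but with the input shell-escaped
    cmd = 'echo %s | treetagger/cmd/tree-tagger-english > ./twitter_message_output.txt' % pipes.quote(text)
    os.system(cmd)
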
def getNGramStructure(sourceFile):
    # initialise n-gram dictionary
    ngrams = {}
    # read file
    corpus = sourceFile.read()
    # get tokens separated by whitespaces
    tokenizedCorpus = tokenize.whitespace(corpus)
    # go through each token
    for token in tokenizedCorpus:
        # split token into single characters
        characters = list(token)
        # copy character list
        charactersBuffer = list(characters)
        # initialise buffer
        buffer1 = ''
        # go through character list
        for char1 in characters:
            # write each n-gram to list
            buffer1 += char1
            ngrams[buffer1] = ngrams.get(buffer1, 0) + 1
            # shift from character list copy
            charactersBuffer.pop(0)
            # initialise buffer
            buffer2 = ''
            # go through copy of character list
            for char2 in charactersBuffer:
                buffer2 += char2
                ngrams[buffer2] = ngrams.get(buffer2, 0) + 1
    # return n-grams
    return ngrams

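A small usage sketch for getNGramStructure, assuming it lives in a module that has done `from nltk import tokenize` (the pre-1.0 NLTK API used throughout these snippets); the file name corpus.txt is illustrative only.

# count character n-grams in a hypothetical plain-text file
source = open('corpus.txt', 'r')
ngrams = getNGramStructure(source)
source.close()

# show the ten most frequent n-grams
for gram, count in sorted(ngrams.items(), key=lambda item: item[1], reverse=True)[:10]:
    print gram, count
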
def tabtagged(files='chunked', basedir=None):
    """
    @param files: One or more treebank files to be processed
    @type files: L{string} or L{tuple(string)}
    @return: iterator over lines in Malt-TAB input format
    """
    if type(files) is str:
        files = (files,)

    if not basedir:
        basedir = get_basedir()

    for file in files:
        path = os.path.join(basedir, "treebank", file)
        f = open(path).read()
        for sent in tokenize.blankline(f):
            l = []
            for t in tokenize.whitespace(sent):
                if t != '[' and t != ']':
                    l.append(tag2tab(t))
            # add a blank line as sentence separator
            l.append('\n')
            yield l

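A brief usage sketch, assuming the old NLTK data layout that get_basedir() points at (with the 'chunked' treebank files installed); it simply prints the first sentence in Malt-TAB form.

# print the first tab-formatted sentence yielded by tabtagged()
for sentence in tabtagged('chunked'):
    for entry in sentence:
        print entry.rstrip()
    break
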
def pos_tag(self, infile):
    train_sents = list(islice(brown.tagged(), 1000000))
    trigram_tagger = tag.Trigram()
    trigram_tagger.train(train_sents)

    file = open(infile + ".txt", "r")
    out = open(infile + "-tag.txt", "w")
    try:
        text = file.read()
        lines = string.split(text, '\n')
        for line in lines:
            tokens = list(tokenize.whitespace(line))
            tagged = list(trigram_tagger.tag(tokens))
            for tags in tagged:
                print tags
                if tags[1] == None:
                    out.write(tags[0] + "/" + "NA")
                else:
                    out.write(tags[0] + "/" + tags[1])
                out.write(" ")
            out.write("\n")
    except IOError:
        raise IOError()
    file.close()
    out.close()

# initialise co-occurrence matrix
coOccurrences = {}

# open file
file = open(path + category, 'r')

# add each line to corpus
for line in file:
    corpus += line

# close file pointer
file.close()

# get tokens from corpus
tokenizedCorpus = tokenize.whitespace(corpus)

# go through tokens
for token in tokenizedCorpus:
    # add token to sentence
    words.append(tag.sub('', token))
    # if a sentence boundary has been found in this token
    if sentenceBoundary.findall(token):
        # recompose sentence
        for word in words:
            sentenceString += word + ' '
        # add to sentence string list
        sentences.append(sentenceString)

def demo(): import sys, time S = GrammarCategory.parse("S") VP = GrammarCategory.parse("VP") NP = GrammarCategory.parse("NP") PP = GrammarCategory.parse("PP") V = GrammarCategory.parse("V") N = GrammarCategory.parse("N") P = GrammarCategory.parse("P") Name = GrammarCategory.parse("Name") Det = GrammarCategory.parse("Det") DetSg = GrammarCategory.parse("Det[-pl]") DetPl = GrammarCategory.parse("Det[+pl]") NSg = GrammarCategory.parse("N[-pl]") NPl = GrammarCategory.parse("N[+pl]") # Define some grammatical productions. grammatical_productions = [ cfg.Production(S, (NP, VP)), cfg.Production(PP, (P, NP)), cfg.Production(NP, (NP, PP)), cfg.Production(VP, (VP, PP)), cfg.Production(VP, (V, NP)), cfg.Production(VP, (V,)), cfg.Production(NP, (DetPl, NPl)), cfg.Production(NP, (DetSg, NSg)), ] # Define some lexical productions. lexical_productions = [ cfg.Production(NP, ("John",)), cfg.Production(NP, ("I",)), cfg.Production(Det, ("the",)), cfg.Production(Det, ("my",)), cfg.Production(Det, ("a",)), cfg.Production(NSg, ("dog",)), cfg.Production(NSg, ("cookie",)), cfg.Production(V, ("ate",)), cfg.Production(V, ("saw",)), cfg.Production(P, ("with",)), cfg.Production(P, ("under",)), ] earley_grammar = cfg.Grammar(S, grammatical_productions) earley_lexicon = {} for prod in lexical_productions: earley_lexicon.setdefault(prod.rhs()[0].upper(), []).append(prod.lhs()) def lexicon(word): return earley_lexicon.get(word.upper(), []) sent = "I saw John with a dog with my cookie" print "Sentence:\n", sent from nltk import tokenize tokens = list(tokenize.whitespace(sent)) t = time.time() cp = FeatureEarleyChartParse(earley_grammar, lexicon, trace=1) trees = cp.get_parse_list(tokens) print "Time: %s" % (time.time() - t) for tree in trees: print tree