def TagSentence(self, words, pos):
    if self.nTagged % 500 == 0:
        self.tagger.stdin.close()
        self.tagger.stdout.close()
        #self.tagger.kill()
        os.kill(self.tagger.pid, SIGTERM)  #Need to do this for python 2.4
        self.tagger.wait()
        self.GetTagger()

    features = []
    seq_features = []
    quotes = Features.GetQuotes(words)
    for i in range(len(words)):
        features = self.fe.Extract(words, pos, None, i, False) + [u'DOMAIN=Twitter']
        if quotes[i]:
            features.append(u"QUOTED")
        seq_features.append(" ".join(features))
    #print ("\t".join(seq_features) + "\n").encode('utf8')
    self.tagger.stdin.write(("\t".join(seq_features) + "\n").encode('utf8'))

    event_tags = []
    for i in range(len(words)):
        event_tags.append(self.tagger.stdout.readline().rstrip('\n').strip(' '))

    self.nTagged += 1
    return event_tags
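# Minimal, self-contained sketch (not part of the original code) of the line protocol
# TagSentence relies on: the parent writes ONE tab-separated line of per-token feature
# strings, and the external tagger answers with ONE line (a tag) per token. The
# `sys.executable -c` child below is only a stand-in for the real tagger; the names
# `_stub` and `_demo_line_protocol` are illustrative, not from the original code.
import subprocess
import sys

_stub = ('import sys\n'
         'while True:\n'
         '    line = sys.stdin.readline()\n'
         '    if not line:\n'
         '        break\n'
         '    for _ in line.rstrip("\\n").split("\\t"):\n'
         '        sys.stdout.write("O\\n")\n'
         '    sys.stdout.flush()\n')

def _demo_line_protocol():
    child = subprocess.Popen([sys.executable, '-c', _stub],
                             stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    feats = ["word=I DOMAIN=Twitter", "word=run DOMAIN=Twitter"]
    child.stdin.write("\t".join(feats) + "\n")   # one line in ...
    child.stdin.flush()
    tags = [child.stdout.readline().rstrip('\n') for _ in feats]  # ... one tag line out per token
    child.stdin.close()
    child.wait()
    return tags  # expected: ['O', 'O']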
def PrintFeatures(sentences):
    for s in sentences:
        words = s.split(' ')
        pos = [x[1] for x in nltk.pos_tag(words)]
        tags = []
        tag = None
        last = True
        for i in range(len(words)):
            mstart = re.search(r'^XXX([A-Z]+)-', words[i])
            mend = re.search(r'-([A-Z]+)XXX$', words[i])
            if mstart:
                tag = "B-%s" % mapTag(mstart.group(1))
                words[i] = re.sub(r'^XXX([A-Z]+)-', '', words[i])
                last = False
            if mend:
                if not mstart:
                    tag = "I-%s" % mapTag(mend.group(1))
                words[i] = re.sub(r'-([A-Z]+)XXX$', '', words[i])
                last = True
            elif last:
                tag = "O"
            elif not last and not mstart:
                tag = tag.replace('B', 'I')

            #Just do entities (no person, loc, etc...)
            if "DATE" in tag or "TIME" in tag or "MONEY" in tag or "PERCENT" in tag:
                tag = "O"
            elif (tag[0] == 'B' or tag[0] == 'I') and not USE_TAGS:
                tag = tag[0] + "-ENTITY"
            tags.append(tag)

        quotes = Features.GetQuotes(words)
        #capFeatures = fe.ExtractCapFeatures(words)
        for i in range(len(words)):
            #features = fe.Extract(words, i) + ['DOMAIN=News'] + capFeatures
            features = fe.Extract(words, pos, i) + ['DOMAIN=News']
            #features = fe.Extract(words, i)
            if quotes[i]:
                features.append("QUOTED")
            #print " ".join(features) + " " + tags[i] + "\n"
            print " ".join(features) + " " + tags[i]
        print
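# Worked example (inferred from the regexes above, so treat it as an illustration only):
# PrintFeatures expects entity spans marked inline as XXXTYPE-token ... token-TYPEXXX
# and converts them to BIO tags, e.g.
#
#   input tokens : XXXPERSON-Barack  Obama-PERSONXXX  visited  Ohio
#   emitted tags : B-PERSON          I-PERSON         O        O
#
# A single-token entity carries both markers (XXXPERSON-Cher-PERSONXXX -> B-PERSON).
# DATE/TIME/MONEY/PERCENT spans are forced to O, and with USE_TAGS disabled the
# remaining types collapse to B-ENTITY / I-ENTITY.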
goodCap = capClassifier.Classify(tweetWords) > 0.9

pos = posTagger.TagSentence(tweetWords)
pos = [re.sub(r':[^:]*$', '', p) for p in pos]  # remove weights

word_pos = zip(tweetWords, [p.split(':')[0] for p in pos])
chunk = chunkTagger.TagSentence(word_pos)
chunk = [c.split(':')[0] for c in chunk]  # remove weights
#print chunk

events = eventTagger.TagSentence(tweetWords, [p.split(':')[0] for p in pos])
events = [e.split(':')[0] for e in events]
#print events

quotes = Features.GetQuotes(tweetWords)
for i in range(len(tweetWords)):
    features = fe.Extract(tweetWords, pos, chunk, i, goodCap) + ['DOMAIN=Twitter']
    if quotes[i]:
        features.append("QUOTED")
    seq_features.append(" ".join(features))
ner.stdin.write(("\t".join(seq_features) + "\n").encode('utf8'))

for i in range(len(tweetWords)):
    tags.append(ner.stdout.readline().rstrip('\n').strip(' '))

#print "Tags before"
#print tags
tweetTags = []
for i in range(len(tags)):
    tweetTags.append(tags[i])
def parse_lines(self, lines):
    res = []
    for line in lines:
        # nLines = 1
        line = line.encode('utf-8', "ignore")
        words = twokenize.tokenize(line)
        seq_features = []
        tags = []

        goodCap = self.capClassifier.Classify(words) > 0.9

        if self.posTagger:
            pos = self.posTagger.TagSentence(words)
            pos = [re.sub(r':[^:]*$', '', p) for p in pos]  # remove weights
        else:
            pos = None

        # Chunking the tweet
        if self.posTagger and self.chunkTagger:
            word_pos = zip(words, [p.split(':')[0] for p in pos])
            chunk = self.chunkTagger.TagSentence(word_pos)
            chunk = [c.split(':')[0] for c in chunk]  # remove weights
        else:
            chunk = None

        # Event tags
        if self.posTagger and self.eventTagger:
            events = self.eventTagger.TagSentence(words, [p.split(':')[0] for p in pos])
            events = [e.split(':')[0] for e in events]
        else:
            events = None

        quotes = Features.GetQuotes(words)
        for i in range(len(words)):
            features = self.fe.Extract(words, pos, chunk, i, goodCap) + ['DOMAIN=Twitter']
            if quotes[i]:
                features.append("QUOTED")
            seq_features.append(" ".join(features))
        self.ner.stdin.write(("\t".join(seq_features) + "\n").encode('utf8'))

        for i in range(len(words)):
            tags.append(self.ner.stdout.readline().rstrip('\n').strip(' '))

        features = LdaFeatures(words, tags)

        # Extract and classify entities
        for i in range(len(features.entities)):
            # type = None
            wids = [str(self.vocab.GetID(x.lower())) for x in features.features[i]
                    if self.vocab.HasWord(x.lower())]
            if self.llda and len(wids) > 0:
                entityid = "-1"
                if self.entityMap.has_key(features.entityStrings[i].lower()):
                    entityid = str(self.entityMap[features.entityStrings[i].lower()])
                labels = self.dictionaries.GetDictVector(features.entityStrings[i])
                if sum(labels) == 0:
                    labels = [1 for _ in labels]
                self.llda.stdin.write("\t".join([entityid, " ".join(wids),
                                                 " ".join([str(x) for x in labels])]) + "\n")
                sample = self.llda.stdout.readline().rstrip('\n')
                labels = [self.dict2label[self.dictMap[int(x)]]
                          for x in sample[4:len(sample) - 8].split(' ')]

                count = {}
                for label in labels:
                    count[label] = count.get(label, 0.0) + 1.0
                maxL = None
                maxP = 0.0
                for label in count.keys():
                    p = count[label] / float(len(count))
                    if p > maxP or maxL == None:
                        maxL = label
                        maxP = p

                if maxL != 'None':
                    tags[features.entities[i][0]] = "B-%s" % (maxL)
                    for j in range(features.entities[i][0] + 1, features.entities[i][1]):
                        tags[j] = "I-%s" % (maxL)
                else:
                    tags[features.entities[i][0]] = "O"
                    for j in range(features.entities[i][0] + 1, features.entities[i][1]):
                        tags[j] = "O"
            else:
                tags[features.entities[i][0]] = "B-ENTITY"
                for j in range(features.entities[i][0] + 1, features.entities[i][1]):
                    tags[j] = "I-ENTITY"

        output = ["%s/%s" % (words[x], tags[x]) for x in range(len(words))]
        if pos:
            output = ["%s/%s" % (output[x], pos[x]) for x in range(len(output))]
        if chunk:
            output = ["%s/%s" % (output[x], chunk[x]) for x in range(len(output))]
        if events:
            output = ["%s/%s" % (output[x], events[x]) for x in range(len(output))]
        res.append(" ".join(output))

        # seems like there is a memory leak coming from mallet, so just restart it
        # every 1,000 tweets or so
        # if nLines % 10000 == 0:
        #     self.trigger_line_counter()
        self.line_counter()
    return res
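# Minimal sketch (an assumption about the interface, not the actual LdaFeatures
# implementation): features.entities above is used as a list of (start, end) index
# pairs covering contiguous B-/I- spans in the NER tag sequence. Spans of that shape
# could be derived from the tags roughly like this:
def bio_spans(tags):
    spans = []
    start = None
    for i, t in enumerate(tags):
        if t.startswith('B-') or (t.startswith('I-') and start is None):
            if start is not None:
                spans.append((start, i))  # close the previous span
            start = i
        elif t == 'O' and start is not None:
            spans.append((start, i))
            start = None
    if start is not None:
        spans.append((start, len(tags)))
    return spans

# bio_spans(['B-ENTITY', 'I-ENTITY', 'O', 'B-ENTITY']) -> [(0, 2), (3, 4)]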
def process_document(input_file, output_file):
    global totalLines, ner, capClassifier, posTagger, chunkTagger, eventTagger, ner_model, fe, vocab, dictMap, dict2index, dictionaries, entityMap, llda, dict2label

    print >> sys.stderr, "Start reading from %s and writing to %s" % (input_file, output_file)
    out_fp = open(output_file, "wb+")
    with open(input_file) as fp:
        nLines = 0
        # row = fp.readline().strip().split("\t")
        # tweet = row[options.text_pos]
        # line = tweet.encode('utf-8')
        while nLines == 0 or len(line) > 1:
            nLines += 1
            totalLines += 1
            row = fp.readline().strip().split("\t")
            if len(row) > 1:
                id_tweet = row[0]
                print >> out_fp, "[#ID#]\t" + id_tweet
                tweet = row[1]
                line = tweet.decode('utf-8', "ignore")
            else:
                line = ''
            if not line:
                print >> sys.stderr, "Finished reading %s lines from %s" % (nLines - 1, input_file)
                break
            # print >> sys.stderr, "Read Line: %s, %s" % (nLines, line),

            words = twokenize.tokenize(line)
            seq_features = []
            tags = []

            goodCap = capClassifier.Classify(words) > 0.9

            if posTagger:
                pos = posTagger.TagSentence(words)
                # pos = [p.split(':')[0] for p in pos]  # remove weights
                pos = [re.sub(r':[^:]*$', '', p) for p in pos]  # remove weights
            else:
                pos = None

            # Chunking the tweet
            if posTagger and chunkTagger:
                word_pos = zip(words, [p.split(':')[0] for p in pos])
                chunk = chunkTagger.TagSentence(word_pos)
                chunk = [c.split(':')[0] for c in chunk]  # remove weights
            else:
                chunk = None

            # Event tags
            if posTagger and eventTagger:
                events = eventTagger.TagSentence(words, [p.split(':')[0] for p in pos])
                events = [e.split(':')[0] for e in events]
            else:
                events = None

            quotes = Features.GetQuotes(words)
            for i in range(len(words)):
                features = fe.Extract(words, pos, chunk, i, goodCap) + ['DOMAIN=Twitter']
                if quotes[i]:
                    features.append("QUOTED")
                seq_features.append(" ".join(features))
            ner.stdin.write(("\t".join(seq_features) + "\n").encode('utf8'))

            for i in range(len(words)):
                tags.append(ner.stdout.readline().rstrip('\n').strip(' '))

            features = LdaFeatures(words, tags)

            # Extract and classify entities
            for i in range(len(features.entities)):
                type = None
                wids = [str(vocab.GetID(x.lower())) for x in features.features[i]
                        if vocab.HasWord(x.lower())]
                if llda and len(wids) > 0:
                    entityid = "-1"
                    if entityMap.has_key(features.entityStrings[i].lower()):
                        entityid = str(entityMap[features.entityStrings[i].lower()])
                    labels = dictionaries.GetDictVector(features.entityStrings[i])
                    if sum(labels) == 0:
                        labels = [1 for x in labels]
                    llda.stdin.write("\t".join([entityid, " ".join(wids),
                                                " ".join([str(x) for x in labels])]) + "\n")
                    sample = llda.stdout.readline().rstrip('\n')
                    labels = [dict2label[dictMap[int(x)]]
                              for x in sample[4:len(sample) - 8].split(' ')]

                    count = {}
                    for label in labels:
                        count[label] = count.get(label, 0.0) + 1.0
                    maxL = None
                    maxP = 0.0
                    for label in count.keys():
                        p = count[label] / float(len(count))
                        print >> out_fp, "[#ETS#]\t", features.entityStrings[i].encode('utf-8'), "\t", label, "\t", p
                        if p > maxP or maxL == None:
                            maxL = label
                            maxP = p

                    if maxL != 'None':
                        tags[features.entities[i][0]] = "B-%s" % (maxL)
                        for j in range(features.entities[i][0] + 1, features.entities[i][1]):
                            tags[j] = "I-%s" % (maxL)
                    else:
                        tags[features.entities[i][0]] = "O"
                        for j in range(features.entities[i][0] + 1, features.entities[i][1]):
                            tags[j] = "O"
                else:
                    tags[features.entities[i][0]] = "B-ENTITY"
                    for j in range(features.entities[i][0] + 1, features.entities[i][1]):
                        tags[j] = "I-ENTITY"

            output = ["%s/%s" % (words[x], tags[x]) for x in range(len(words))]
            if pos:
                output = ["%s/%s" % (output[x], pos[x]) for x in range(len(output))]
            if chunk:
                output = ["%s/%s" % (output[x], chunk[x]) for x in range(len(output))]
            if events:
                output = ["%s/%s" % (output[x], events[x]) for x in range(len(output))]

            #sys.stdout.write((" ".join(output) + "\n").encode('utf8'))
            row[1] = (" ".join(output))
            #print >> out_fp, ("\t".join(row)).encode('utf8')
            print >> out_fp, "[#TWEET#]\t" + line.encode('utf-8') + "\n"
            #print >> sys.stderr, "\tWrote Line: %s, %s" % (nLines, row[options.text_pos])

            # if pos:
            #     sys.stdout.write((" ".join(["%s/%s/%s" % (words[x], tags[x], pos[x]) for x in range(len(words))]) + "\n").encode('utf8'))
            # else:
            #     sys.stdout.write((" ".join(["%s/%s" % (words[x], tags[x]) for x in range(len(words))]) + "\n").encode('utf8'))
            #sys.stdout.flush()

            # seems like there is a memory leak coming from mallet, so just restart it
            # every 1,000 tweets or so
            # if totalLines % 1000 == 0:
            #     start = time.time()
            #     ner.stdin.close()
            #     ner.stdout.close()
            #     #if ner.wait() != 0:
            #     #    sys.stderr.write("error!\n")
            #     #ner.kill()
            #     os.kill(ner.pid, SIGTERM)  #Need to do this for python 2.4
            #     ner.wait()
            #     ner = GetNer(ner_model)

    # Close file
    out_fp.close()
#h = hpy()
#print h.heap()
#print "================================================"

nLines = 1
for line in sys.stdin:
    line = unicode(line, errors='ignore')
    line = line.rstrip(u'\n')
    fields = line.split(u'\t')
    words = twokenize.tokenize(fields[6])

    seq_features = []
    tags = []
    pos = fields[-2].split(u' ')

    quotes = Features.GetQuotes(words)
    for i in range(len(words)):
        features = fe.Extract(words, pos, None, i, False) + [u'DOMAIN=Twitter']
        if quotes[i]:
            features.append(u"QUOTED")
        seq_features.append(" ".join(features))
    ner.stdin.write("\t".join(seq_features) + u"\n")

    for i in range(len(words)):
        tags.append(ner.stdout.readline().rstrip(u'\n').strip(u' '))

    print line + u"\t%s" % u" ".join(tags)
    sys.stdout.flush()

    # seems like there is a memory leak coming from mallet, so just restart it
    # every 1,000 tweets or so
    #if nLines % 1000 == 0:
def parseOneTweet(line):
    words = twokenize.tokenize(line)
    seq_features = []
    tags = []

    goodCap = capClassifier.Classify(words) > 0.9

    if posTagger:
        pos = posTagger.TagSentence(words)
        #pos = [p.split(':')[0] for p in pos]  # remove weights
        pos = [re.sub(r':[^:]*$', '', p) for p in pos]  # remove weights
    else:
        pos = None

    # Chunking the tweet
    if posTagger and chunkTagger:
        word_pos = zip(words, [p.split(':')[0] for p in pos])
        chunk = chunkTagger.TagSentence(word_pos)
        chunk = [c.split(':')[0] for c in chunk]  # remove weights
    else:
        chunk = None

    #Event tags
    if posTagger and eventTagger:
        events = eventTagger.TagSentence(words, [p.split(':')[0] for p in pos])
        events = [e.split(':')[0] for e in events]
    else:
        events = None

    quotes = Features.GetQuotes(words)
    for i in range(len(words)):
        features = fe.Extract(words, pos, chunk, i, goodCap) + ['DOMAIN=Twitter']
        if quotes[i]:
            features.append("QUOTED")
        seq_features.append(" ".join(features))
    ner.stdin.write(("\t".join(seq_features) + "\n").encode('utf8'))

    for i in range(len(words)):
        tags.append(ner.stdout.readline().rstrip('\n').strip(' '))

    features = LdaFeatures(words, tags)

    #Extract and classify entities
    for i in range(len(features.entities)):
        type = None
        wids = [str(vocab.GetID(x.lower())) for x in features.features[i]
                if vocab.HasWord(x.lower())]
        if llda and len(wids) > 0:
            entityid = "-1"
            if entityMap.has_key(features.entityStrings[i].lower()):
                entityid = str(entityMap[features.entityStrings[i].lower()])
            labels = dictionaries.GetDictVector(features.entityStrings[i])
            if sum(labels) == 0:
                labels = [1 for x in labels]
            llda.stdin.write("\t".join([entityid, " ".join(wids),
                                        " ".join([str(x) for x in labels])]) + "\n")
            sample = llda.stdout.readline().rstrip('\n')
            labels = [dict2label[dictMap[int(x)]]
                      for x in sample[4:len(sample) - 8].split(' ')]

            count = {}
            for label in labels:
                count[label] = count.get(label, 0.0) + 1.0
            maxL = None
            maxP = 0.0
            for label in count.keys():
                p = count[label] / float(len(count))
                if p > maxP or maxL == None:
                    maxL = label
                    maxP = p

            if maxL != 'None':
                tags[features.entities[i][0]] = "B-%s" % (maxL)
                for j in range(features.entities[i][0] + 1, features.entities[i][1]):
                    tags[j] = "I-%s" % (maxL)
            else:
                tags[features.entities[i][0]] = "O"
                for j in range(features.entities[i][0] + 1, features.entities[i][1]):
                    tags[j] = "O"
        else:
            tags[features.entities[i][0]] = "B-ENTITY"
            for j in range(features.entities[i][0] + 1, features.entities[i][1]):
                tags[j] = "I-ENTITY"

    output = ["%s/%s" % (words[x], tags[x]) for x in range(len(words))]
    if pos:
        output = ["%s/%s" % (output[x], pos[x]) for x in range(len(output))]
    if chunk:
        output = ["%s/%s" % (output[x], chunk[x]) for x in range(len(output))]
    if events:
        output = ["%s/%s" % (output[x], events[x]) for x in range(len(output))]

    return " ".join(output)
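# Hedged helper (not part of the original code): parseOneTweet() returns one
# space-separated item per token, with fields joined by '/' in the order
# word/NER[/POS[/chunk[/event]]]. A rough way to split the fields back out, assuming a
# fixed field count; the rsplit keeps any '/' inside the token itself attached to the word.
def split_tagged_tweet(tagged, n_fields=2):
    rows = []
    for item in tagged.split(' '):
        rows.append(tuple(item.rsplit('/', n_fields - 1)))
    return rows

# split_tagged_tweet("Barack/B-PERSON/NNP Obama/I-PERSON/NNP", n_fields=3)
#   -> [('Barack', 'B-PERSON', 'NNP'), ('Obama', 'I-PERSON', 'NNP')]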
def tag_tweets(tweets):
    global ner
    tweets_count = 1
    tagged_tweets = []
    for line in tweets:
        record = dict()
        record.update({'tweet_text': line})
        words = twokenize.tokenize(line)
        seq_features = []
        tags = []

        goodCap = capClassifier.Classify(words) > 0.9

        if posTagger:
            pos = posTagger.TagSentence(words)
            #pos = [p.split(':')[0] for p in pos]  # remove weights
            pos = [re.sub(r':[^:]*$', '', p) for p in pos]  # remove weights
        else:
            pos = None

        # Chunking the tweet
        if posTagger and chunkTagger:
            word_pos = zip(words, [p.split(':')[0] for p in pos])
            chunk = chunkTagger.TagSentence(word_pos)
            chunk = [c.split(':')[0] for c in chunk]  # remove weights
        else:
            chunk = None

        #Event tags
        if posTagger and eventTagger:
            events = eventTagger.TagSentence(words, [p.split(':')[0] for p in pos])
            events = [e.split(':')[0] for e in events]
        else:
            events = None

        quotes = Features.GetQuotes(words)
        for i in range(len(words)):
            features = fe.Extract(words, pos, chunk, i, goodCap) + ['DOMAIN=Twitter']
            if quotes[i]:
                features.append("QUOTED")
            seq_features.append(" ".join(features))
        ner.stdin.write(("\t".join(seq_features) + "\n").encode('utf8'))

        for i in range(len(words)):
            tags.append(ner.stdout.readline().rstrip('\n').strip(' '))

        features = LdaFeatures(words, tags)

        #Extract and classify entities
        for i in range(len(features.entities)):
            type = None
            wids = [str(vocab.GetID(x.lower())) for x in features.features[i]
                    if vocab.HasWord(x.lower())]
            if llda and len(wids) > 0:
                entityid = "-1"
                if entityMap.has_key(features.entityStrings[i].lower()):
                    entityid = str(entityMap[features.entityStrings[i].lower()])
                labels = dictionaries.GetDictVector(features.entityStrings[i])
                if sum(labels) == 0:
                    labels = [1 for x in labels]
                llda.stdin.write("\t".join([entityid, " ".join(wids),
                                            " ".join([str(x) for x in labels])]) + "\n")
                sample = llda.stdout.readline().rstrip('\n')
                labels = [dict2label[dictMap[int(x)]]
                          for x in sample[4:len(sample) - 8].split(' ')]

                count = {}
                for label in labels:
                    count[label] = count.get(label, 0.0) + 1.0
                maxL = None
                maxP = 0.0
                for label in count.keys():
                    p = count[label] / float(len(count))
                    if p > maxP or maxL == None:
                        maxL = label
                        maxP = p

                if maxL != 'None':
                    tags[features.entities[i][0]] = "B-%s" % (maxL)
                    for j in range(features.entities[i][0] + 1, features.entities[i][1]):
                        tags[j] = "I-%s" % (maxL)
                else:
                    tags[features.entities[i][0]] = "O"
                    for j in range(features.entities[i][0] + 1, features.entities[i][1]):
                        tags[j] = "O"
            else:
                tags[features.entities[i][0]] = "B-ENTITY"
                for j in range(features.entities[i][0] + 1, features.entities[i][1]):
                    tags[j] = "I-ENTITY"

        output = ["%s/%s" % (words[x], tags[x]) for x in range(len(words))]
        if pos:
            output = ["%s/%s" % (output[x], pos[x]) for x in range(len(output))]
        if chunk:
            output = ["%s/%s" % (output[x], chunk[x]) for x in range(len(output))]
        if events:
            output = ["%s/%s" % (output[x], events[x]) for x in range(len(output))]

        ttweet = (" ".join(output) + "\n").encode('utf8')
        record.update({'tagged_tweet': ttweet})
        if only_entities:
            entities = get_entities(ttweet)
            record.update({'entitites': entities})
        tagged_tweets.append(record)

        # seems like there is a memory leak coming from mallet, so just restart it
        # every 1,000 tweets or so
        if tweets_count % 1000 == 0:
            start = time.time()
            ner.stdin.close()
            ner.stdout.close()
            #if ner.wait() != 0:
            #    sys.stderr.write("error!\n")
            #ner.kill()
            os.kill(ner.pid, SIGTERM)  #Need to do this for python 2.4
            ner.wait()
            ner = GetNer(ner_model)
        tweets_count += 1

    end_time = time.time()
    print "Average time per tweet = %ss" % (str((end_time - start_time) / tweets_count))
    return tagged_tweets
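# Hypothetical sketch of a get_entities-style helper (the real get_entities used in
# tag_tweets is not shown in this snippet, so this is only a guess at its behaviour):
# collect the surface strings of consecutive non-'O' tokens from a tagged tweet,
# assuming the tokens themselves contain no '/'.
def extract_entity_strings(tagged_tweet):
    entities = []
    current = []
    for item in tagged_tweet.strip().split(' '):
        word, tag = item.split('/')[0], item.split('/')[1]
        if tag.startswith('B-') or (tag.startswith('I-') and not current):
            if current and tag.startswith('B-'):
                entities.append(' '.join(current))
                current = []
            current.append(word)
        elif tag.startswith('I-'):
            current.append(word)
        else:  # 'O'
            if current:
                entities.append(' '.join(current))
                current = []
    if current:
        entities.append(' '.join(current))
    return entities

# extract_entity_strings("Barack/B-PERSON Obama/I-PERSON spoke/O") -> ['Barack Obama']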
fields = line.split('\t')
sid = fields[0]
date = fields[9][0:10]
confidence = 1.0 / float(fields[-1])
eType = fields[-2]
entity = fields[-3]
neTags = fields[-4].split(' ')
pos = fields[-5].split(' ')
words = fields[-6].split(' ')

key = "%s\t%s\t%s" % (entity, eType, date)

if prevSid and prevSid != sid and minConf and minConf > 0.9:
    goodCap = cap.Classify(prevWords) > 0.5
    quotes = Features.GetQuotes(prevWords)
    for i in range(len(prevWords)):
        features = fe.Extract(prevWords, prevPos, i, goodCap) + ['DOMAIN=Twitter']
        if quotes[i]:
            features.append("QUOTED")
        print " ".join(features) + " %s" % prevTags[i]
    print

if prevSid != sid:
    minConf = None

prevWords = words
prevPos = pos
prevTags = neTags
#Read in the sentences, and get output of capitalization classifier
#cap_classifier = subprocess.Popen('python/cap/cap_classify data/cap/tweets_cap_labeled.csv.model',
#                                  shell=True,
#                                  stdin=subprocess.PIPE,
#                                  stdout=subprocess.PIPE)

#for line in open(DATA_DIR + "/mmax_ner_sentence_level.xml"):
for line in open(DATA_DIR + "/%s_sentence_level.xml" % (project_name)):
    line = line.rstrip('\n')
    m = re.match(r'<markable id="markable_(\d+)" span="word_(\d+)..word_(\d+)" mmax_level="[^"]+" />', line)
    if m:
        start = int(m.group(2)) - 1
        end = int(m.group(3))
        q = Features.GetQuotes(words[start:end])
        #sys.stderr.write(str(words[start:end]) + "\n")
        #sys.stderr.write(str(q) + "\n")
        for i in range(len(q)):
            if q[i]:
                quotes[start + i] = 1
        startSentence[start] = end
        endSentence[end - 1] = 1

sentences = [words[i:startSentence[i]] for i in startSentence.keys()]
sentenceTags = [tags[i:startSentence[i]] for i in startSentence.keys()]

posChunk = Tag(sentences)

#Print out the data
#posTagger = pos_tagger_stdin.PosTagger()