def TagSentence(self, words, pos):
    if self.nTagged % 500 == 0:
        self.tagger.stdin.close()
        self.tagger.stdout.close()
        #self.tagger.kill()
        os.kill(self.tagger.pid, SIGTERM)  #Need to do this for python 2.4
        self.tagger.wait()
        self.GetTagger()

    features = []
    seq_features = []
    quotes = Features.GetQuotes(words)
    for i in range(len(words)):
        features = self.fe.Extract(words, pos, None, i, False) + [u'DOMAIN=Twitter']
        if quotes[i]:
            features.append(u"QUOTED")
        seq_features.append(" ".join(features))
    #print ("\t".join(seq_features) + "\n").encode('utf8')
    self.tagger.stdin.write(("\t".join(seq_features) + "\n").encode('utf8'))

    event_tags = []
    for i in range(len(words)):
        event_tags.append(self.tagger.stdout.readline().rstrip('\n').strip(' '))

    self.nTagged += 1
    return event_tags
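# Minimal, self-contained sketch (not part of the original code) of the line protocol
# TagSentence relies on: the parent writes ONE tab-separated line of per-token feature
# strings, and the external tagger answers with ONE line (a tag) per token. The
# `sys.executable -c` child below is only a stand-in for the real tagger; the names
# `_stub` and `_demo_line_protocol` are illustrative, not from the original code.
import subprocess
import sys

_stub = ('import sys\n'
         'while True:\n'
         '    line = sys.stdin.readline()\n'
         '    if not line:\n'
         '        break\n'
         '    for _ in line.rstrip("\\n").split("\\t"):\n'
         '        sys.stdout.write("O\\n")\n'
         '    sys.stdout.flush()\n')

def _demo_line_protocol():
    child = subprocess.Popen([sys.executable, '-c', _stub],
                             stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    feats = ["word=I DOMAIN=Twitter", "word=run DOMAIN=Twitter"]
    child.stdin.write("\t".join(feats) + "\n")   # one line in ...
    child.stdin.flush()
    tags = [child.stdout.readline().rstrip('\n') for _ in feats]  # ... one tag line out per token
    child.stdin.close()
    child.wait()
    return tags  # expected: ['O', 'O']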
def PrintFeatures(sentences):
    for s in sentences:
        words = s.split(' ')
        pos = [x[1] for x in nltk.pos_tag(words)]
        tags = []
        tag = None
        last = True
        for i in range(len(words)):
            mstart = re.search(r'^XXX([A-Z]+)-', words[i])
            mend = re.search(r'-([A-Z]+)XXX$', words[i])
            if mstart:
                tag = "B-%s" % mapTag(mstart.group(1))
                words[i] = re.sub(r'^XXX([A-Z]+)-', '', words[i])
                last = False
            if mend:
                if not mstart:
                    tag = "I-%s" % mapTag(mend.group(1))
                words[i] = re.sub(r'-([A-Z]+)XXX$', '', words[i])
                last = True
            elif last:
                tag = "O"
            elif not last and not mstart:
                tag = tag.replace('B', 'I')

            #Just do entities (no person, loc, etc...)
            if "DATE" in tag or "TIME" in tag or "MONEY" in tag or "PERCENT" in tag:
                tag = "O"
            elif (tag[0] == 'B' or tag[0] == 'I') and not USE_TAGS:
                tag = tag[0] + "-ENTITY"
            tags.append(tag)

        quotes = Features.GetQuotes(words)
        #capFeatures = fe.ExtractCapFeatures(words)
        for i in range(len(words)):
            #features = fe.Extract(words, i) + ['DOMAIN=News'] + capFeatures
            features = fe.Extract(words, pos, i) + ['DOMAIN=News']
            #features = fe.Extract(words, i)
            if quotes[i]:
                features.append("QUOTED")
            #print " ".join(features) + " " + tags[i] + "\n"
            print " ".join(features) + " " + tags[i]
        print
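# Worked example (inferred from the regexes above, so treat it as an illustration only):
# PrintFeatures expects entity spans marked inline as XXXTYPE-token ... token-TYPEXXX
# and converts them to BIO tags, e.g.
#
#   input tokens : XXXPERSON-Barack  Obama-PERSONXXX  visited  Ohio
#   emitted tags : B-PERSON          I-PERSON         O        O
#
# A single-token entity carries both markers (XXXPERSON-Cher-PERSONXXX -> B-PERSON).
# DATE/TIME/MONEY/PERCENT spans are forced to O, and with USE_TAGS disabled the
# remaining types collapse to B-ENTITY / I-ENTITY.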
goodCap = capClassifier.Classify(tweetWords) > 0.9

pos = posTagger.TagSentence(tweetWords)
pos = [re.sub(r':[^:]*$', '', p) for p in pos]  # remove weights

word_pos = zip(tweetWords, [p.split(':')[0] for p in pos])
chunk = chunkTagger.TagSentence(word_pos)
chunk = [c.split(':')[0] for c in chunk]  # remove weights
#print chunk

events = eventTagger.TagSentence(tweetWords, [p.split(':')[0] for p in pos])
events = [e.split(':')[0] for e in events]
#print events

quotes = Features.GetQuotes(tweetWords)
for i in range(len(tweetWords)):
    features = fe.Extract(tweetWords, pos, chunk, i, goodCap) + ['DOMAIN=Twitter']
    if quotes[i]:
        features.append("QUOTED")
    seq_features.append(" ".join(features))
ner.stdin.write(("\t".join(seq_features) + "\n").encode('utf8'))

for i in range(len(tweetWords)):
    tags.append(ner.stdout.readline().rstrip('\n').strip(' '))

#print "Tags before"
#print tags
tweetTags = []
for i in range(len(tags)):
    tweetTags.append(tags[i])
def parse_lines(self, lines):
    res = []
    for line in lines:
        # nLines = 1
        line = line.encode('utf-8', "ignore")
        words = twokenize.tokenize(line)
        seq_features = []
        tags = []

        goodCap = self.capClassifier.Classify(words) > 0.9

        if self.posTagger:
            pos = self.posTagger.TagSentence(words)
            pos = [re.sub(r':[^:]*$', '', p) for p in pos]  # remove weights
        else:
            pos = None

        # Chunking the tweet
        if self.posTagger and self.chunkTagger:
            word_pos = zip(words, [p.split(':')[0] for p in pos])
            chunk = self.chunkTagger.TagSentence(word_pos)
            chunk = [c.split(':')[0] for c in chunk]  # remove weights
        else:
            chunk = None

        # Event tags
        if self.posTagger and self.eventTagger:
            events = self.eventTagger.TagSentence(words, [p.split(':')[0] for p in pos])
            events = [e.split(':')[0] for e in events]
        else:
            events = None

        quotes = Features.GetQuotes(words)
        for i in range(len(words)):
            features = self.fe.Extract(words, pos, chunk, i, goodCap) + ['DOMAIN=Twitter']
            if quotes[i]:
                features.append("QUOTED")
            seq_features.append(" ".join(features))
        self.ner.stdin.write(("\t".join(seq_features) + "\n").encode('utf8'))

        for i in range(len(words)):
            tags.append(self.ner.stdout.readline().rstrip('\n').strip(' '))

        features = LdaFeatures(words, tags)

        # Extract and classify entities
        for i in range(len(features.entities)):
            # type = None
            wids = [str(self.vocab.GetID(x.lower())) for x in features.features[i]
                    if self.vocab.HasWord(x.lower())]
            if self.llda and len(wids) > 0:
                entityid = "-1"
                if self.entityMap.has_key(features.entityStrings[i].lower()):
                    entityid = str(self.entityMap[features.entityStrings[i].lower()])
                labels = self.dictionaries.GetDictVector(features.entityStrings[i])
                if sum(labels) == 0:
                    labels = [1 for _ in labels]
                self.llda.stdin.write("\t".join([entityid, " ".join(wids),
                                                 " ".join([str(x) for x in labels])]) + "\n")
                sample = self.llda.stdout.readline().rstrip('\n')
                labels = [self.dict2label[self.dictMap[int(x)]]
                          for x in sample[4:len(sample) - 8].split(' ')]

                count = {}
                for label in labels:
                    count[label] = count.get(label, 0.0) + 1.0
                maxL = None
                maxP = 0.0
                for label in count.keys():
                    p = count[label] / float(len(count))
                    if p > maxP or maxL == None:
                        maxL = label
                        maxP = p

                if maxL != 'None':
                    tags[features.entities[i][0]] = "B-%s" % (maxL)
                    for j in range(features.entities[i][0] + 1, features.entities[i][1]):
                        tags[j] = "I-%s" % (maxL)
                else:
                    tags[features.entities[i][0]] = "O"
                    for j in range(features.entities[i][0] + 1, features.entities[i][1]):
                        tags[j] = "O"
            else:
                tags[features.entities[i][0]] = "B-ENTITY"
                for j in range(features.entities[i][0] + 1, features.entities[i][1]):
                    tags[j] = "I-ENTITY"

        output = ["%s/%s" % (words[x], tags[x]) for x in range(len(words))]
        if pos:
            output = ["%s/%s" % (output[x], pos[x]) for x in range(len(output))]
        if chunk:
            output = ["%s/%s" % (output[x], chunk[x]) for x in range(len(output))]
        if events:
            output = ["%s/%s" % (output[x], events[x]) for x in range(len(output))]
        res.append(" ".join(output))

        # seems like there is a memory leak coming from mallet, so just restart it
        # every 1,000 tweets or so
        # if nLines % 10000 == 0:
        #     self.trigger_line_counter()
        self.line_counter()
    return res
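# Minimal sketch (an assumption about the interface, not the actual LdaFeatures
# implementation): features.entities above is used as a list of (start, end) index
# pairs covering contiguous B-/I- spans in the NER tag sequence. Spans of that shape
# could be derived from the tags roughly like this:
def bio_spans(tags):
    spans = []
    start = None
    for i, t in enumerate(tags):
        if t.startswith('B-') or (t.startswith('I-') and start is None):
            if start is not None:
                spans.append((start, i))  # close the previous span
            start = i
        elif t == 'O' and start is not None:
            spans.append((start, i))
            start = None
    if start is not None:
        spans.append((start, len(tags)))
    return spans

# bio_spans(['B-ENTITY', 'I-ENTITY', 'O', 'B-ENTITY']) -> [(0, 2), (3, 4)]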
def process_document(input_file, output_file):
    global totalLines, ner, capClassifier, posTagger, chunkTagger, eventTagger, ner_model, fe, vocab, dictMap, dict2index, dictionaries, entityMap, llda, dict2label

    print >> sys.stderr, "Start reading from %s and writing to %s" % (input_file, output_file)
    out_fp = open(output_file, "wb+")
    with open(input_file) as fp:
        nLines = 0
        # row = fp.readline().strip().split("\t")
        # tweet = row[options.text_pos]
        # line = tweet.encode('utf-8')
        while nLines == 0 or len(line) > 1:
            nLines += 1
            totalLines += 1
            row = fp.readline().strip().split("\t")
            if len(row) > 1:
                id_tweet = row[0]
                print >> out_fp, "[#ID#]\t" + id_tweet
                tweet = row[1]
                line = tweet.decode('utf-8', "ignore")
            else:
                line = ''
            if not line:
                print >> sys.stderr, "Finished reading %s lines from %s" % (nLines - 1, input_file)
                break
            # print >> sys.stderr, "Read Line: %s, %s" % (nLines, line),

            words = twokenize.tokenize(line)
            seq_features = []
            tags = []

            goodCap = capClassifier.Classify(words) > 0.9

            if posTagger:
                pos = posTagger.TagSentence(words)
                # pos = [p.split(':')[0] for p in pos]  # remove weights
                pos = [re.sub(r':[^:]*$', '', p) for p in pos]  # remove weights
            else:
                pos = None

            # Chunking the tweet
            if posTagger and chunkTagger:
                word_pos = zip(words, [p.split(':')[0] for p in pos])
                chunk = chunkTagger.TagSentence(word_pos)
                chunk = [c.split(':')[0] for c in chunk]  # remove weights
            else:
                chunk = None

            # Event tags
            if posTagger and eventTagger:
                events = eventTagger.TagSentence(words, [p.split(':')[0] for p in pos])
                events = [e.split(':')[0] for e in events]
            else:
                events = None

            quotes = Features.GetQuotes(words)
            for i in range(len(words)):
                features = fe.Extract(words, pos, chunk, i, goodCap) + ['DOMAIN=Twitter']
                if quotes[i]:
                    features.append("QUOTED")
                seq_features.append(" ".join(features))
            ner.stdin.write(("\t".join(seq_features) + "\n").encode('utf8'))

            for i in range(len(words)):
                tags.append(ner.stdout.readline().rstrip('\n').strip(' '))

            features = LdaFeatures(words, tags)

            # Extract and classify entities
            for i in range(len(features.entities)):
                type = None
                wids = [str(vocab.GetID(x.lower())) for x in features.features[i]
                        if vocab.HasWord(x.lower())]
                if llda and len(wids) > 0:
                    entityid = "-1"
                    if entityMap.has_key(features.entityStrings[i].lower()):
                        entityid = str(entityMap[features.entityStrings[i].lower()])
                    labels = dictionaries.GetDictVector(features.entityStrings[i])
                    if sum(labels) == 0:
                        labels = [1 for x in labels]
                    llda.stdin.write("\t".join([entityid, " ".join(wids),
                                                " ".join([str(x) for x in labels])]) + "\n")
                    sample = llda.stdout.readline().rstrip('\n')
                    labels = [dict2label[dictMap[int(x)]]
                              for x in sample[4:len(sample) - 8].split(' ')]

                    count = {}
                    for label in labels:
                        count[label] = count.get(label, 0.0) + 1.0
                    maxL = None
                    maxP = 0.0
                    for label in count.keys():
                        p = count[label] / float(len(count))
                        print >> out_fp, "[#ETS#]\t", features.entityStrings[i].encode('utf-8'), "\t", label, "\t", p
                        if p > maxP or maxL == None:
                            maxL = label
                            maxP = p

                    if maxL != 'None':
                        tags[features.entities[i][0]] = "B-%s" % (maxL)
                        for j in range(features.entities[i][0] + 1, features.entities[i][1]):
                            tags[j] = "I-%s" % (maxL)
                    else:
                        tags[features.entities[i][0]] = "O"
                        for j in range(features.entities[i][0] + 1, features.entities[i][1]):
                            tags[j] = "O"
                else:
                    tags[features.entities[i][0]] = "B-ENTITY"
                    for j in range(features.entities[i][0] + 1, features.entities[i][1]):
                        tags[j] = "I-ENTITY"

            output = ["%s/%s" % (words[x], tags[x]) for x in range(len(words))]
            if pos:
                output = ["%s/%s" % (output[x], pos[x]) for x in range(len(output))]
            if chunk:
                output = ["%s/%s" % (output[x], chunk[x]) for x in range(len(output))]
            if events:
                output = ["%s/%s" % (output[x], events[x]) for x in range(len(output))]

            #sys.stdout.write((" ".join(output) + "\n").encode('utf8'))
            row[1] = (" ".join(output))
            #print >> out_fp, ("\t".join(row)).encode('utf8')
            print >> out_fp, "[#TWEET#]\t" + line.encode('utf-8') + "\n"
            #print >> sys.stderr, "\tWrote Line: %s, %s" % (nLines, row[options.text_pos])

            # if pos:
            #     sys.stdout.write((" ".join(["%s/%s/%s" % (words[x], tags[x], pos[x]) for x in range(len(words))]) + "\n").encode('utf8'))
            # else:
            #     sys.stdout.write((" ".join(["%s/%s" % (words[x], tags[x]) for x in range(len(words))]) + "\n").encode('utf8'))
            #sys.stdout.flush()

            # seems like there is a memory leak coming from mallet, so just restart it
            # every 1,000 tweets or so
            # if totalLines % 1000 == 0:
            #     start = time.time()
            #     ner.stdin.close()
            #     ner.stdout.close()
            #     #if ner.wait() != 0:
            #     #    sys.stderr.write("error!\n")
            #     #ner.kill()
            #     os.kill(ner.pid, SIGTERM)  #Need to do this for python 2.4
            #     ner.wait()
            #     ner = GetNer(ner_model)

    # Close file
    out_fp.close()
#h = hpy()
#print h.heap()
#print "================================================"

nLines = 1
for line in sys.stdin:
    line = unicode(line, errors='ignore')
    line = line.rstrip(u'\n')
    fields = line.split(u'\t')
    words = twokenize.tokenize(fields[6])

    seq_features = []
    tags = []
    pos = fields[-2].split(u' ')

    quotes = Features.GetQuotes(words)
    for i in range(len(words)):
        features = fe.Extract(words, pos, None, i, False) + [u'DOMAIN=Twitter']
        if quotes[i]:
            features.append(u"QUOTED")
        seq_features.append(" ".join(features))
    ner.stdin.write("\t".join(seq_features) + u"\n")

    for i in range(len(words)):
        tags.append(ner.stdout.readline().rstrip(u'\n').strip(u' '))

    print line + u"\t%s" % u" ".join(tags)
    sys.stdout.flush()

    # seems like there is a memory leak coming from mallet, so just restart it
    # every 1,000 tweets or so
    #if nLines % 1000 == 0:
def parseOneTweet(line):
    words = twokenize.tokenize(line)
    seq_features = []
    tags = []

    goodCap = capClassifier.Classify(words) > 0.9

    if posTagger:
        pos = posTagger.TagSentence(words)
        #pos = [p.split(':')[0] for p in pos]  # remove weights
        pos = [re.sub(r':[^:]*$', '', p) for p in pos]  # remove weights
    else:
        pos = None

    # Chunking the tweet
    if posTagger and chunkTagger:
        word_pos = zip(words, [p.split(':')[0] for p in pos])
        chunk = chunkTagger.TagSentence(word_pos)
        chunk = [c.split(':')[0] for c in chunk]  # remove weights
    else:
        chunk = None

    #Event tags
    if posTagger and eventTagger:
        events = eventTagger.TagSentence(words, [p.split(':')[0] for p in pos])
        events = [e.split(':')[0] for e in events]
    else:
        events = None

    quotes = Features.GetQuotes(words)
    for i in range(len(words)):
        features = fe.Extract(words, pos, chunk, i, goodCap) + ['DOMAIN=Twitter']
        if quotes[i]:
            features.append("QUOTED")
        seq_features.append(" ".join(features))
    ner.stdin.write(("\t".join(seq_features) + "\n").encode('utf8'))

    for i in range(len(words)):
        tags.append(ner.stdout.readline().rstrip('\n').strip(' '))

    features = LdaFeatures(words, tags)

    #Extract and classify entities
    for i in range(len(features.entities)):
        type = None
        wids = [str(vocab.GetID(x.lower())) for x in features.features[i]
                if vocab.HasWord(x.lower())]
        if llda and len(wids) > 0:
            entityid = "-1"
            if entityMap.has_key(features.entityStrings[i].lower()):
                entityid = str(entityMap[features.entityStrings[i].lower()])
            labels = dictionaries.GetDictVector(features.entityStrings[i])
            if sum(labels) == 0:
                labels = [1 for x in labels]
            llda.stdin.write("\t".join([entityid, " ".join(wids),
                                        " ".join([str(x) for x in labels])]) + "\n")
            sample = llda.stdout.readline().rstrip('\n')
            labels = [dict2label[dictMap[int(x)]]
                      for x in sample[4:len(sample) - 8].split(' ')]

            count = {}
            for label in labels:
                count[label] = count.get(label, 0.0) + 1.0
            maxL = None
            maxP = 0.0
            for label in count.keys():
                p = count[label] / float(len(count))
                if p > maxP or maxL == None:
                    maxL = label
                    maxP = p

            if maxL != 'None':
                tags[features.entities[i][0]] = "B-%s" % (maxL)
                for j in range(features.entities[i][0] + 1, features.entities[i][1]):
                    tags[j] = "I-%s" % (maxL)
            else:
                tags[features.entities[i][0]] = "O"
                for j in range(features.entities[i][0] + 1, features.entities[i][1]):
                    tags[j] = "O"
        else:
            tags[features.entities[i][0]] = "B-ENTITY"
            for j in range(features.entities[i][0] + 1, features.entities[i][1]):
                tags[j] = "I-ENTITY"

    output = ["%s/%s" % (words[x], tags[x]) for x in range(len(words))]
    if pos:
        output = ["%s/%s" % (output[x], pos[x]) for x in range(len(output))]
    if chunk:
        output = ["%s/%s" % (output[x], chunk[x]) for x in range(len(output))]
    if events:
        output = ["%s/%s" % (output[x], events[x]) for x in range(len(output))]

    return " ".join(output)
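# Hedged helper (not part of the original code): parseOneTweet() returns one
# space-separated item per token, with fields joined by '/' in the order
# word/NER[/POS[/chunk[/event]]]. A rough way to split the fields back out, assuming a
# fixed field count; the rsplit keeps any '/' inside the token itself attached to the word.
def split_tagged_tweet(tagged, n_fields=2):
    rows = []
    for item in tagged.split(' '):
        rows.append(tuple(item.rsplit('/', n_fields - 1)))
    return rows

# split_tagged_tweet("Barack/B-PERSON/NNP Obama/I-PERSON/NNP", n_fields=3)
#   -> [('Barack', 'B-PERSON', 'NNP'), ('Obama', 'I-PERSON', 'NNP')]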
def tag_tweets(tweets):
    global ner
    tweets_count = 1
    tagged_tweets = []
    for line in tweets:
        record = dict()
        record.update({'tweet_text': line})
        words = twokenize.tokenize(line)
        seq_features = []
        tags = []

        goodCap = capClassifier.Classify(words) > 0.9

        if posTagger:
            pos = posTagger.TagSentence(words)
            #pos = [p.split(':')[0] for p in pos]  # remove weights
            pos = [re.sub(r':[^:]*$', '', p) for p in pos]  # remove weights
        else:
            pos = None

        # Chunking the tweet
        if posTagger and chunkTagger:
            word_pos = zip(words, [p.split(':')[0] for p in pos])
            chunk = chunkTagger.TagSentence(word_pos)
            chunk = [c.split(':')[0] for c in chunk]  # remove weights
        else:
            chunk = None

        #Event tags
        if posTagger and eventTagger:
            events = eventTagger.TagSentence(words, [p.split(':')[0] for p in pos])
            events = [e.split(':')[0] for e in events]
        else:
            events = None

        quotes = Features.GetQuotes(words)
        for i in range(len(words)):
            features = fe.Extract(words, pos, chunk, i, goodCap) + ['DOMAIN=Twitter']
            if quotes[i]:
                features.append("QUOTED")
            seq_features.append(" ".join(features))
        ner.stdin.write(("\t".join(seq_features) + "\n").encode('utf8'))

        for i in range(len(words)):
            tags.append(ner.stdout.readline().rstrip('\n').strip(' '))

        features = LdaFeatures(words, tags)

        #Extract and classify entities
        for i in range(len(features.entities)):
            type = None
            wids = [str(vocab.GetID(x.lower())) for x in features.features[i]
                    if vocab.HasWord(x.lower())]
            if llda and len(wids) > 0:
                entityid = "-1"
                if entityMap.has_key(features.entityStrings[i].lower()):
                    entityid = str(entityMap[features.entityStrings[i].lower()])
                labels = dictionaries.GetDictVector(features.entityStrings[i])
                if sum(labels) == 0:
                    labels = [1 for x in labels]
                llda.stdin.write("\t".join([entityid, " ".join(wids),
                                            " ".join([str(x) for x in labels])]) + "\n")
                sample = llda.stdout.readline().rstrip('\n')
                labels = [dict2label[dictMap[int(x)]]
                          for x in sample[4:len(sample) - 8].split(' ')]

                count = {}
                for label in labels:
                    count[label] = count.get(label, 0.0) + 1.0
                maxL = None
                maxP = 0.0
                for label in count.keys():
                    p = count[label] / float(len(count))
                    if p > maxP or maxL == None:
                        maxL = label
                        maxP = p

                if maxL != 'None':
                    tags[features.entities[i][0]] = "B-%s" % (maxL)
                    for j in range(features.entities[i][0] + 1, features.entities[i][1]):
                        tags[j] = "I-%s" % (maxL)
                else:
                    tags[features.entities[i][0]] = "O"
                    for j in range(features.entities[i][0] + 1, features.entities[i][1]):
                        tags[j] = "O"
            else:
                tags[features.entities[i][0]] = "B-ENTITY"
                for j in range(features.entities[i][0] + 1, features.entities[i][1]):
                    tags[j] = "I-ENTITY"

        output = ["%s/%s" % (words[x], tags[x]) for x in range(len(words))]
        if pos:
            output = ["%s/%s" % (output[x], pos[x]) for x in range(len(output))]
        if chunk:
            output = ["%s/%s" % (output[x], chunk[x]) for x in range(len(output))]
        if events:
            output = ["%s/%s" % (output[x], events[x]) for x in range(len(output))]

        ttweet = (" ".join(output) + "\n").encode('utf8')
        record.update({'tagged_tweet': ttweet})
        if only_entities:
            entities = get_entities(ttweet)
            record.update({'entitites': entities})
        tagged_tweets.append(record)

        # seems like there is a memory leak coming from mallet, so just restart it
        # every 1,000 tweets or so
        if tweets_count % 1000 == 0:
            start = time.time()
            ner.stdin.close()
            ner.stdout.close()
            #if ner.wait() != 0:
            #    sys.stderr.write("error!\n")
            #ner.kill()
            os.kill(ner.pid, SIGTERM)  #Need to do this for python 2.4
            ner.wait()
            ner = GetNer(ner_model)
        tweets_count += 1

    end_time = time.time()
    print "Average time per tweet = %ss" % (str((end_time - start_time) / tweets_count))
    return tagged_tweets
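# Hypothetical sketch of a get_entities-style helper (the real get_entities used in
# tag_tweets is not shown in this snippet, so this is only a guess at its behaviour):
# collect the surface strings of consecutive non-'O' tokens from a tagged tweet,
# assuming the tokens themselves contain no '/'.
def extract_entity_strings(tagged_tweet):
    entities = []
    current = []
    for item in tagged_tweet.strip().split(' '):
        word, tag = item.split('/')[0], item.split('/')[1]
        if tag.startswith('B-') or (tag.startswith('I-') and not current):
            if current and tag.startswith('B-'):
                entities.append(' '.join(current))
                current = []
            current.append(word)
        elif tag.startswith('I-'):
            current.append(word)
        else:  # 'O'
            if current:
                entities.append(' '.join(current))
                current = []
    if current:
        entities.append(' '.join(current))
    return entities

# extract_entity_strings("Barack/B-PERSON Obama/I-PERSON spoke/O") -> ['Barack Obama']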
fields = line.split('\t')
sid = fields[0]
date = fields[9][0:10]
confidence = 1.0 / float(fields[-1])
eType = fields[-2]
entity = fields[-3]
neTags = fields[-4].split(' ')
pos = fields[-5].split(' ')
words = fields[-6].split(' ')

key = "%s\t%s\t%s" % (entity, eType, date)

if prevSid and prevSid != sid and minConf and minConf > 0.9:
    goodCap = cap.Classify(prevWords) > 0.5
    quotes = Features.GetQuotes(prevWords)
    for i in range(len(prevWords)):
        features = fe.Extract(prevWords, prevPos, i, goodCap) + ['DOMAIN=Twitter']
        if quotes[i]:
            features.append("QUOTED")
        print " ".join(features) + " %s" % prevTags[i]
    print

if prevSid != sid:
    minConf = None

prevWords = words
prevPos = pos
prevTags = neTags
#Read in the sentences, and get output of capitalization classifier
#cap_classifier = subprocess.Popen('python/cap/cap_classify data/cap/tweets_cap_labeled.csv.model',
#                                  shell=True,
#                                  stdin=subprocess.PIPE,
#                                  stdout=subprocess.PIPE)

#for line in open(DATA_DIR + "/mmax_ner_sentence_level.xml"):
for line in open(DATA_DIR + "/%s_sentence_level.xml" % (project_name)):
    line = line.rstrip('\n')
    m = re.match(r'<markable id="markable_(\d+)" span="word_(\d+)..word_(\d+)" mmax_level="[^"]+" />', line)
    if m:
        start = int(m.group(2)) - 1
        end = int(m.group(3))
        q = Features.GetQuotes(words[start:end])
        #sys.stderr.write(str(words[start:end]) + "\n")
        #sys.stderr.write(str(q) + "\n")
        for i in range(len(q)):
            if q[i]:
                quotes[start + i] = 1
        startSentence[start] = end
        endSentence[end - 1] = 1

sentences = [words[i:startSentence[i]] for i in startSentence.keys()]
sentenceTags = [tags[i:startSentence[i]] for i in startSentence.keys()]

posChunk = Tag(sentences)

#Print out the data
#posTagger = pos_tagger_stdin.PosTagger()