Example #1
0
def pipeline_test(json_file, brown_file, featurefile, labelfile, outdir):
    #  load feature mapping and label mapping
    feature_map = load_map(featurefile)
    label_map = load_map(labelfile)

    reader = MentionReader(json_file)
    ner_feature = NERFeature(is_train=False, brown_file=brown_file, feature_mapping=feature_map, label_mapping=label_map)
    count = 0
    gx = open(outdir+'/test_x.txt', 'w')
    gy = open(outdir+'/test_y.txt', 'w')

    print 'start test feature generation'
    while reader.has_next():
            
        if count%1000 == 0:
            print count
        sentence = reader.next()
        for mention in sentence.mentions:
            try:
                m_id = '%s_%d_%d_%d'%(sentence.fileid, sentence.senid, mention.start, mention.end)
                feature_ids, label_ids = ner_feature.extract(sentence, mention)
                if len(label_ids)>0:
                    gx.write(m_id+'\t'+','.join([str(x) for x in feature_ids])+'\n')
                    gy.write(m_id+'\t'+','.join([str(x) for x in label_ids])+'\n')
                    count += 1
            except Exception as e:
                print e.message, e.args
                print sentence.fileid, sentence.senid
                print sentence
                continue
    print count
    reader.close()
    gx.close()
    gy.close()
Example #2
0
def pipeline_test(json_file, brown_file, featurefile, labelfile, outdir):
    #  load feature mapping and label mapping
    feature_map = load_map(featurefile)
    label_map = load_map(labelfile)

    reader = MentionReader(json_file)
    ner_feature = NERFeature(is_train=False,
                             brown_file=brown_file,
                             feature_mapping=feature_map,
                             label_mapping=label_map)
    count = 0
    gx = open(outdir + '/test_x.txt', 'w')
    gy = open(outdir + '/test_y.txt', 'w')
    '''
    map from mention_id to text, saved in "mention_text.map"
    '''

    gz = open(outdir + '/mention_text.map', 'w')

    print 'start test feature generation'
    while reader.has_next():

        if count % 1000 == 0:
            print count
        sentence = reader.next()
        print '!!!!!!'
        for mention in sentence.mentions:
            print '@@@@@!!!!!!'
            try:
                m_id = '%s_%d_%d_%d' % (sentence.fileid, sentence.senid,
                                        mention.start, mention.end)
                print '$$$$$!!!!!!'
                feature_ids, label_ids = ner_feature.extract(sentence, mention)
                if len(label_ids) > 0:
                    print '^^^^^^^!!!!!!'
                    gx.write(m_id + '\t' +
                             ','.join([str(x) for x in feature_ids]) + '\n')
                    gy.write(m_id + '\t' +
                             ','.join([str(x) for x in label_ids]) + '\n')

                    if mention.start >= 0:
                        print '*******!!!!!!'
                        sent = sentence.get_orig_text()
                        c_start = mention.get_c_start()
                        c_end = mention.get_c_end()
                        gz.write(m_id+'\t'+mention.get_entity()+'\t'+ \
                        str(mention.start) +'\t'+ str(mention.end)+ '\t' + \
                        str(c_start) + '\t' + str(c_end) + '\t' + \
                        sentence.get_text() +  '\n')

                    count += 1
            except Exception as e:
                print e.message, e.args
                print sentence.fileid, sentence.senid
                print sentence
                continue
    print count
    reader.close()
    gx.close()
    gy.close()
def pipeline_test(json_file, brown_file, featurefile, labelfile, outdir,
                  requireEmType, isEntityMention):
    #  load feature mapping and label mapping
    feature_map = load_map(featurefile)
    label_map = load_map(labelfile)

    reader = MentionReader(json_file)
    ner_feature = NERFeature(is_train=False,
                             brown_file=brown_file,
                             requireEmType=requireEmType,
                             isEntityMention=isEntityMention,
                             feature_mapping=feature_map,
                             label_mapping=label_map)
    count = 0
    gx = open(outdir + '/test_x.txt', 'w')
    gy = open(outdir + '/test_y.txt', 'w')

    print 'start test feature generation'
    while reader.has_next():
        if count % 10000 == 0 and count != 0:
            sys.stdout.write('process ' + str(count) + ' lines\r')
            sys.stdout.flush()
        sentence = reader.next()
        if isEntityMention:
            mentions = sentence.entityMentions
        else:
            mentions = sentence.relationMentions
        for mention in mentions:
            try:
                if isEntityMention:
                    m_id = '%s_%s_%d_%d' % (sentence.articleId,
                                            sentence.sentId, mention.start,
                                            mention.end)
                else:
                    label1 = sentence.get_em_text(mention.em1Start,
                                                  mention.em1End)
                    label2 = sentence.get_em_text(mention.em2Start,
                                                  mention.em2End)
                    m_id = '%s_%s_%s_%s' % (label1, label2, sentence.articleId,
                                            sentence.sentId)
                    # m_id = '%s_%d_%d_%d_%d_%d'%(sentence.articleId, sentence.sentId, mention.em1Start, mention.em1End, mention.em2Start, mention.em2End)
                #print mention.em1Start, mention.em1End, mention.em2Start, mention.em2End
                feature_ids, label_ids = ner_feature.extract(sentence, mention)
                gx.write(m_id + '\t' + ','.join([str(x)
                                                 for x in feature_ids]) + '\n')
                gy.write(m_id + '\t' + ','.join([str(x)
                                                 for x in label_ids]) + '\n')
                count += 1
            except Exception as e:
                print e.message, e.args
                print sentence.articleId, sentence.sentId
                print mention
                continue
    type_test = open(outdir + '/type_test.txt', 'w')
    write_map(ner_feature.label_mapping, type_test)
    print '\n'
    reader.close()
    gx.close()
    gy.close()
    print(ner_feature.lc)
Example #4
0
def pipeline_test(json_file, brown_file, featurefile, labelfile, outdir):
    #  load feature mapping and label mapping
    feature_map = load_map(featurefile)
    label_map = load_map(labelfile)

    reader = MentionReader(json_file)
    ner_feature = NERFeature(is_train=False, brown_file=brown_file, feature_mapping=feature_map, label_mapping=label_map)
    count = 0
    gx = open(outdir+'/test_x.txt', 'w')
    gy = open(outdir+'/test_y.txt', 'w')

    print 'start test feature generation'
    while reader.has_next():
            
        if count%1000 == 0:
            print count
        sentence = reader.next()
        for mention in sentence.mentions:
            try:
                m_id = '%s_%d_%d_%d'%(sentence.fileid, sentence.senid, mention.start, mention.end)
                feature_ids, label_ids = ner_feature.extract(sentence, mention)
                if len(label_ids)>0:
                    gx.write(m_id+'\t'+','.join([str(x) for x in feature_ids])+'\n')
                    gy.write(m_id+'\t'+','.join([str(x) for x in label_ids])+'\n')
                    count += 1
            except Exception as e:
                print e.message, e.args
                print sentence.fileid, sentence.senid
                print sentence
                continue
    print count
    reader.close()
    gx.close()
    gy.close()
Example #5
0
def pipeline(json_file, brown_file, outdir):
    reader = MentionReader(json_file)
    ner_feature = NERFeature(is_train=True, brown_file=brown_file)
    count = 0
    gx = open(outdir+'/train_x.txt', 'w')
    gy = open(outdir+'/train_y.txt', 'w')
    f = open(outdir+'/feature.map', 'w')
    t = open(outdir+'/type.txt', 'w')
    print 'start train feature generation'
    mention_count = 0
    while reader.has_next():
        if count%10000 == 0:
            print count
        sentence = reader.next()
        for mention in sentence.mentions:
            try:
                m_id = '%s_%d_%d_%d'%(sentence.fileid, sentence.senid, mention.start, mention.end)
                feature_ids, label_ids = ner_feature.extract(sentence, mention)
                gx.write(m_id+'\t'+','.join([str(x) for x in feature_ids])+'\n')
                gy.write(m_id+'\t'+','.join([str(x) for x in label_ids])+'\n')
                mention_count += 1
                count += 1
            except Exception as e:
                print e.message, e.args
                print sentence.fileid, sentence.senid, len(sentence.tokens)
                print mention
    print 'mention :%d'%mention_count
    print 'feature :%d'%len(ner_feature.feature_mapping)
    print 'label :%d'%len(ner_feature.label_mapping)
    write_map(ner_feature.feature_mapping, f)
    write_map(ner_feature.label_mapping, t)
    reader.close()
    gx.close()
    gy.close()
    f.close()
    t.close()
def pipeline(json_file, brown_file, outdir, requireEmType, isEntityMention):
    reader = MentionReader(json_file)
    ner_feature = NERFeature(is_train=True, brown_file=brown_file, requireEmType=requireEmType, isEntityMention=isEntityMention, feature_mapping={}, label_mapping={})
    count = 0
    gx = open(outdir+'/train_x.txt', 'w')
    gy = open(outdir+'/train_y.txt', 'w')
    f = open(outdir+'/feature.map', 'w')
    t = open(outdir+'/type.txt', 'w')
    label_counts_file = open(outdir+'/label_counts.txt', 'w')
    print 'start train feature generation'
    mention_count = 0
    mentionCountByNumOfLabels = {}
    while reader.has_next():
        if count%10000 == 0:
            sys.stdout.write('process ' + str(count) + ' lines\r')
            sys.stdout.flush()
        sentence = reader.next()
        if isEntityMention:
            mentions = sentence.entityMentions
        else:
            mentions = sentence.relationMentions
        for mention in mentions:
            try:
                if isEntityMention:
                    m_id = '%s_%s_%d_%d'%(sentence.articleId, sentence.sentId, mention.start, mention.end)
                else:
                    m_id = '%s_%d_%d_%d_%d_%d'%(sentence.articleId, sentence.sentId, mention.em1Start, mention.em1End, mention.em2Start, mention.em2End)
                feature_ids, label_ids = ner_feature.extract(sentence, mention)
                if len(label_ids) not in mentionCountByNumOfLabels:
                    mentionCountByNumOfLabels[len(label_ids)] = 1
                else:
                    mentionCountByNumOfLabels[len(label_ids)] += 1
                gx.write(m_id+'\t'+','.join([str(x) for x in feature_ids])+'\n')
                gy.write(m_id+'\t'+','.join([str(x) for x in label_ids])+'\n')
                mention_count += 1
                count += 1
            except Exception as e:
                print e.message, e.args
                print sentence.articleId, sentence.sentId, len(sentence.tokens)
                print mention
                raise
    print '\n'
    print 'mention :%d'%mention_count
    print 'feature :%d'%len(ner_feature.feature_mapping)
    print 'label :%d'%len(ner_feature.label_mapping)
    sorted_map = sorted(mentionCountByNumOfLabels.items(),cmp=lambda x,y:x[0]-y[0])
    for item in sorted_map:
        label_counts_file.write(str(item[0])+'\t'+str(item[1])+'\n')
    write_map(ner_feature.feature_mapping, f)
    write_map(ner_feature.label_mapping, t)
    reader.close()
    gx.close()
    gy.close()
    f.close()
    t.close()
Example #7
0
def pipeline(json_file, brown_file, outdir):
    reader = MentionReader(json_file)
    ner_feature = NERFeature(is_train=True, brown_file=brown_file)
    count = 0
    gx = open(outdir + '/train_x.txt', 'w')
    gy = open(outdir + '/train_y.txt', 'w')
    gd = open(outdir + '/mention_reader_debug.txt', 'w')
    f = open(outdir + '/feature.map', 'w')
    t = open(outdir + '/type.txt', 'w')
    print 'start train feature generation'
    mention_count = 0
    while reader.has_next():
        if count % 10000 == 0:
            print count
        sentence = reader.next()
        gd.write(str(sentence) + '\n')
        for mention in sentence.mentions:
            try:
                m_id = '%s_%d_%d_%d' % (sentence.fileid, sentence.senid,
                                        mention.start, mention.end)
                feature_ids, label_ids = ner_feature.extract(sentence, mention)
                gx.write(m_id + '\t' + ','.join([str(x)
                                                 for x in feature_ids]) + '\n')
                gy.write(m_id + '\t' + ','.join([str(x)
                                                 for x in label_ids]) + '\n')

                mention_count += 1
                count += 1
            except Exception as e:
                print e.message, e.args
                print sentence.fileid, sentence.senid, len(sentence.tokens)
                print mention
    print 'mention :%d' % mention_count
    print 'feature :%d' % len(ner_feature.feature_mapping)
    print 'label :%d' % len(ner_feature.label_mapping)
    write_map(ner_feature.feature_mapping, f)
    write_map(ner_feature.label_mapping, t)
    reader.close()
    gx.close()
    gy.close()
    f.close()
    t.close()
Example #8
0
def pipeline_qa(json_file, brown_file, featurefile, labelfile, outdir,
                requireEmType, isEntityMention):
    feature_map = load_map_qa(featurefile)
    label_map = load_map(labelfile)

    reader = MentionReader(json_file)
    ner_feature = NERFeature(is_train=True,
                             brown_file=brown_file,
                             requireEmType=requireEmType,
                             isEntityMention=isEntityMention,
                             feature_mapping=feature_map,
                             label_mapping=label_map)
    ner_feature.feature_count = len(feature_map)
    count = 0
    gx = open(outdir + '/qa_x.txt', 'w')
    gy = open(outdir + '/qa_y.txt', 'w')
    f = open(outdir + '/feature.map', 'w')
    #t = open(outdir+'/type.txt', 'w')
    print 'start qa em pairs feature generation'
    mention_count = 0
    mentionCountByNumOfLabels = {}
    question2mentions = {}
    while reader.has_next():
        if count % 10000 == 0:
            sys.stdout.write('process ' + str(count) + ' lines\r')
            sys.stdout.flush()
        sentence = reader.next()
        question = sentence.articleId
        sentLabel = sentence.label
        assert sentLabel != None
        if isEntityMention:
            mentions = sentence.entityMentions
        else:
            mentions = sentence.relationMentions
            if sentLabel == 'pos':
                assert len(mentions) == 1
        for mention in mentions:
            try:
                if isEntityMention:
                    m_id = '%s_%s_%d_%d' % (sentence.articleId,
                                            sentence.sentId, mention.start,
                                            mention.end)
                else:
                    m_id = '%s_%d_%d_%d_%d_%d' % (
                        sentence.articleId, sentence.sentId, mention.em1Start,
                        mention.em1End, mention.em2Start, mention.em2End)
                    if question not in question2mentions:
                        question2mentions[question] = {}
                    if sentLabel in question2mentions[question]:
                        question2mentions[question][sentLabel].add(m_id)
                    else:
                        question2mentions[question][sentLabel] = set([m_id])

                feature_ids, label_ids = ner_feature.extract(sentence, mention)
                if len(label_ids) not in mentionCountByNumOfLabels:
                    mentionCountByNumOfLabels[len(label_ids)] = 1
                else:
                    mentionCountByNumOfLabels[len(label_ids)] += 1
                gx.write(m_id + '\t' + ','.join([str(x)
                                                 for x in feature_ids]) + '\n')
                gy.write(m_id + '\t' + ','.join([str(x)
                                                 for x in label_ids]) + '\n')
                mention_count += 1
                count += 1
            except Exception as e:
                print e.message, e.args
                print sentence.articleId, sentence.sentId, len(sentence.tokens)
                print mention
                raise
    print '\n'
    print 'mention :%d' % mention_count
    print 'feature :%d' % len(ner_feature.feature_mapping)
    print 'label :%d' % len(ner_feature.label_mapping)
    sorted_map = sorted(mentionCountByNumOfLabels.items(),
                        cmp=lambda x, y: x[0] - y[0])
    write_map(ner_feature.feature_mapping, f)
    #write_map(ner_feature.label_mapping, t)
    reader.close()
    gx.close()
    gy.close()
    f.close()
    #t.close()

    qa_pair = open(outdir + '/qa_pair.txt', 'w')
    qa_mpair = open(outdir + '/qa_mpair.txt', 'w')
    question_map = open(outdir + '/question.txt', 'w')
    qid = 0
    for question in question2mentions:
        if len(question2mentions[question]) < 2:
            continue
        for mid in question2mentions[question]['pos']:
            qa_pair.write(mid + '\t' + str(qid) + '\t1.0\n')
        for mid in question2mentions[question]['neg']:
            qa_pair.write(mid + '\t' + str(qid) + '\t0.0\n')
        question_map.write(question + '\t' + str(qid) + '\n')
        qid += 1
        for mid1 in question2mentions[question]['pos']:
            for mid2 in question2mentions[question]['pos']:
                if mid1 == mid2:
                    continue
                qa_mpair.write(mid1 + '\t' + mid2 + '\t1\n')
            for mid2 in question2mentions[question]['neg']:
                qa_mpair.write(mid1 + '\t' + mid2 + '\t0\n')
    qa_mpair.close()
    qa_pair.close()
    question_map.close()
def pipeline_qa(json_file, brown_file, featurefile, labelfile, outdir,
                requireEmType, isEntityMention):
    feature_map = load_map_qa(featurefile)
    label_map = load_map(labelfile)

    reader = MentionReader(json_file)
    ner_feature = NERFeature(is_train=True,
                             brown_file=brown_file,
                             requireEmType=requireEmType,
                             isEntityMention=isEntityMention,
                             feature_mapping=feature_map,
                             label_mapping=label_map)
    ner_feature.feature_count = len(feature_map)
    count = 0
    gx = open(outdir + '/qa_x.txt', 'w')
    f = open(outdir + '/feature.map', 'w')
    if not isEntityMention:
        question_file = open(outdir + '/question.txt', 'w')
    #t = open(outdir+'/type.txt', 'w')
    print 'start train feature generation'
    mention_count = 0

    while reader.has_next():
        if count % 10000 == 0:
            sys.stdout.write('process ' + str(count) + ' lines\r')
            sys.stdout.flush()
        sentence = reader.next()
        question = sentence.articleId

        if isEntityMention:
            mentions = sentence.entityMentions
        else:
            mentions = sentence.relationMentions

        for mention in mentions:
            if not isEntityMention:
                if sentence.questionPositions:
                    questionPositions = ' '.join(
                        [str(p) for p in sentence.questionPositions])
                    question_file.write(
                        str(mention_count) + '\t' + question + '\t' +
                        questionPositions + '\n')
                else:
                    question_file.write(
                        str(mention_count) + '\t' + question + '\n')
            try:
                if isEntityMention:
                    m_id = '%s_%d_%d' % (sentence.sentId, mention.start,
                                         mention.end)
                else:
                    m_id = '%d_%d_%d_%d_%d' % (
                        sentence.sentId, mention.em1Start, mention.em1End,
                        mention.em2Start, mention.em2End)

                feature_ids, label_ids = ner_feature.extract(sentence, mention)
                gx.write(m_id + '\t' + ','.join([str(x)
                                                 for x in feature_ids]) + '\n')
                mention_count += 1
                count += 1
            except Exception as e:
                print e.message, e.args
                print sentence.articleId, sentence.sentId, len(sentence.tokens)
                print mention
                raise
    print '\n'
    print 'mention :%d' % mention_count
    print 'feature :%d' % len(ner_feature.feature_mapping)
    print 'label :%d' % len(ner_feature.label_mapping)

    write_map(ner_feature.feature_mapping, f)

    reader.close()
    gx.close()
    f.close()