Example #1
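The importer below reads a WebAP query file (JSON) and a corpus in a modified TRECTEXT format, then stores the topics and sentence-level data in a model directory. Each importer in these examples assumes its surrounding summaryrank module context: argparse is imported at module level, and IMPORTER_DESCRIPTION, IMPORTER_EPILOG, get_topics, and get_sentences are defined alongside the function.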
def import_webap(argv):
    """ Import WebAP data """
    parser = argparse.ArgumentParser(
        prog='import_webap',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=IMPORTER_DESCRIPTION,
        epilog=IMPORTER_EPILOG,
    )

    parser.add_argument('-m', dest='model', metavar='DIR', required=True,
                        help='store the processed data in DIR')
    parser.add_argument('query_file',
                        help='query file, in JSON format')
    parser.add_argument('corpus_file',
                        help='corpus file, modified TRECTEXT format')
    args = parser.parse_args(argv)

    model = summaryrank.Model(args.model)

    # process and save query topics
    topics = get_topics(summaryrank.open(args.query_file))
    qids = [m['qid'] for _, m in topics]
    model.save_topics(topics)

    # process corpus data and save sentences
    sentences = get_sentences(summaryrank.open(args.corpus_file))
    model.save_sentences_qrels(sentences, qids=set(qids))
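A minimal invocation sketch (the file names are placeholders, not from the source):

import_webap(['-m', 'webap_model', 'queries.json', 'corpus.trectext'])

Example #2

This variant imports TREC Novelty Track data. Besides a TREC-format query file, it takes the corpus as a gzipped tarball and a separate relevance judgment (qrels) file; the extracted sentences are filtered down to the imported query ids and decoded as latin-1.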
def import_trec_novelty(argv):
    """ Import TREC Novelty Track data """
    parser = argparse.ArgumentParser(
        prog='import_trec_novelty',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=IMPORTER_DESCRIPTION,
        epilog=IMPORTER_EPILOG,
    )

    parser.add_argument('-m', dest='model', metavar='DIR', required=True,
                        help='store the processed data in DIR')
    parser.add_argument('query_file',
                        help='query file, in TREC format')
    parser.add_argument('corpus_file',
                        help='corpus file, a tarball as distributed by TREC')
    parser.add_argument('qrels_file',
                        help='relevance judgment file')
    args = parser.parse_args(argv)

    model = summaryrank.Model(args.model)

    # process and save query topics
    topics = get_topics(summaryrank.open(args.query_file))
    qids = [m['qid'] for _, m in topics]
    model.save_topics(topics)

    # process corpus data and save sentences
    qrels = get_qrels_set(summaryrank.open(args.qrels_file))
    sentences = get_sentences(tarfile.open(args.corpus_file, 'r:gz'),
                              qids=qids, qrels=qrels, charset='latin-1')
    model.save_sentences_qrels(sentences, qids=set(qids))
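A hypothetical call, again with placeholder paths:

import_trec_novelty(['-m', 'novelty_model', 'topics.txt', 'corpus.tar.gz', 'qrels.txt'])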
Example #3
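This importer handles MobileClick-2 data: a queries file, an iUnits file, and an optional iUnit weights file. When no weights file is given, qrels falls back to an empty dict, so sentences are saved without relevance weights.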
def import_mobileclick(argv):
    """ Import MobileClick-2 data """
    parser = argparse.ArgumentParser(
        prog='import_mobileclick',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=IMPORTER_DESCRIPTION,
        epilog=IMPORTER_EPILOG,
    )

    parser.add_argument('-m',
                        dest='model',
                        metavar='DIR',
                        required=True,
                        help='store the processed data in DIR')
    parser.add_argument('queries_file')
    parser.add_argument('iunits_file')
    parser.add_argument('weights_file', nargs='?')
    args = parser.parse_args(argv)

    model = summaryrank.Model(args.model)

    # process and save query topics
    topics = list(get_topics(summaryrank.open(args.queries_file)))
    model.save_topics(topics)
    qids = [m['qid'] for _, m in topics]

    # process corpus data and save sentences
    if args.weights_file:
        qrels = get_qrels(summaryrank.open(args.weights_file))
    else:
        qrels = {}
    sentences = get_sentences(summaryrank.open(args.iunits_file),
                              qrels=qrels,
                              charset='utf8')
    model.save_sentences_qrels(sentences, qids=set(qids))
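A sketch with placeholder file names (the trailing weights file may be omitted):

import_mobileclick(['-m', 'mc2_model', 'queries.tsv', 'iunits.tsv', 'weights.tsv'])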
Example #4
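The contextualize command below reads an SVMLight-style vector file and, for each selected feature XYZ, emits two context features, SentenceBefore[XYZ] and SentenceAfter[XYZ], taking the feature's value from the neighbouring sentence and zeroing it at document boundaries. The -f option accepts a comma-separated list of field numbers and ranges (e.g. 1,3,5-8); without it, all features are selected. The code targets Python 2 (print statements, dict.iteritems), and AutoHelpArgumentParser and svmlight_tools are assumed to come from the surrounding summaryrank codebase.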
def contextualize(argv):
    """ Generate context features

    Generate two context features SentenceBefore[XYZ] and SentenceAfter[XYZ] for
    each feature XYZ in the given set.
    """
    parser = AutoHelpArgumentParser(prog='contextualize')
    parser.add_argument('-f', dest='fields', metavar='LIST',
                        help='select only these fields')
    parser.add_argument('vector_file',
                        help='the input vector file')
    args = parser.parse_args(argv)

    selector = set()
    if args.fields:
        for comp in args.fields.split(','):
            if comp.find('-') >= 0:
                l, u = map(int, comp.split('-'))
                selector.update(range(l, u + 1))
            else:
                selector.add(int(comp))

    rows = svmlight_tools.get_rows(summaryrank.open(args.vector_file), with_preamble=True)
    preamble = next(rows)

    features = []
    for fid, name in svmlight_tools.get_preamble_features(preamble):
        if not args.fields:
            selector.add(fid)
        if fid in selector:
            features.append('SentenceBefore[{}]'.format(name))
            features.append('SentenceAfter[{}]'.format(name))

    print '# Features in use'
    for fid, name in enumerate(features, 1):
        print '# {}: {}'.format(fid, name)

    new_fids = len(features) + 1  # exclusive upper bound of the new feature ids

    # From here onwards is Damiano's contribution
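    # Keep a sliding window of three consecutive rows: pointer1 holds the
    # previous sentence, pointer2 the current one, and pointer3 the next.
    # Each row is therefore processed one iteration after it is read, and
    # the final row is handled separately after the loop.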
    pointer1 = None
    pointer2 = None
    pointer3 = None

    for line in rows:
        pointer1 = pointer2
        pointer2 = pointer3
        pointer3 = line

        new_features = {}
        if pointer2:
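            # Apparent row format: 'label qid:<q> <fid>:<val> ... # <x>:<docid>:<x>';
            # the docid inside the trailing comment marks document boundaries.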
            current_head, current_comment = pointer2.split('# ')
            _, current_docid, _ = current_comment.split(':')
            current_fields = current_head.split()
            # SentenceBefore context features sit at the odd feature ids:
            if not pointer1: # first sentence
                for fid in range(1, new_fids):
                    if fid % 2 != 0:
                        new_features[fid] = 0
            else:
                # is the previous sentence from the same document?
                previous_head, previous_comment = pointer1.split('# ')
                _, previous_docid, _ = previous_comment.split(':')
                previous_fields = dict([f.split(':') for f in previous_head.split()[2:]])

                if previous_docid != current_docid:
                    for fid in range(1, new_fids):
                        if fid % 2 != 0:
                            new_features[fid] = 0
                else:
                    new_fid = 1
                    for fid in sorted(selector):  # ascending, matching feature naming order
                        before_value = previous_fields[str(fid)]
                        new_features[new_fid] = before_value
                        new_fid += 2

            # SentenceAfter context features sit at the even feature ids:
            next_head, next_comment = pointer3.split('# ')
            _, next_docid, _ = next_comment.split(':')
            next_fields = dict([f.split(':') for f in next_head.split()[2:]])
            if next_docid != current_docid:
                for fid in range(1, new_fids):
                    if fid % 2 == 0:
                        new_features[fid] = 0
            else:
                new_fid = 2
                for fid in sorted(selector):  # ascending, matching feature naming order
                    after_value = next_fields[str(fid)]
                    new_features[new_fid] = after_value
                    new_fid += 2

            # Print the current row with its new context features appended:
            buffer_ = [current_fields[0], current_fields[1]]

            for k, v in sorted(new_features.iteritems()):  # ascending fid order
                buffer_.append('{}:{}'.format(k, v))
            print ' '.join(buffer_), '#', current_comment,

    # Special case: end of file
    current_head, current_comment = pointer3.split('# ')
    _, current_docid, _ = current_comment.split(':')
    current_fields = current_head.split()

    previous_head, previous_comment = pointer2.split('# ')
    _, previous_docid, _ = previous_comment.split(':')
    previous_fields = dict([f.split(':') for f in previous_head.split()[2:]])

    new_features = {}

    # add SentenceBefore features
    if previous_docid != current_docid:
        for fid in range(1, new_fids):
            if fid % 2 != 0:
                new_features[fid] = 0
    else:
        new_fid = 1
        for fid in sorted(selector):  # ascending, matching feature naming order
            before_value = previous_fields[str(fid)]
            new_features[new_fid] = before_value
            new_fid += 2

    # add SentenceAfter features (the last sentence has no successor, so zero them)
    for fid in range(1, new_fids):
        if fid % 2 == 0:
            new_features[fid] = 0
    buffer_ = [current_fields[0], current_fields[1]]

    for k, v in sorted(new_features.iteritems()):  # ascending fid order
        buffer_.append('{}:{}'.format(k, v))
    print ' '.join(buffer_), '#', current_comment,
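A hypothetical run that contextualizes only fields 1 to 3 (the file name is a placeholder):

contextualize(['-f', '1-3', 'vectors.txt'])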