def import_webap(argv):
    """ Import WebAP data """
    # CLI: a destination model directory plus the two WebAP input files.
    parser = argparse.ArgumentParser(
        prog='import_webap',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=IMPORTER_DESCRIPTION,
        epilog=IMPORTER_EPILOG,
    )
    parser.add_argument('-m', dest='model', metavar='DIR', required=True,
                        help='store the processed data in DIR')
    parser.add_argument('query_file', help='query file, in JSON format')
    parser.add_argument('corpus_file',
                        help='corpus file, modified TRECTEXT format')
    args = parser.parse_args(argv)

    model = summaryrank.Model(args.model)

    # Topics first: their qids restrict which sentences get stored below.
    topics = get_topics(summaryrank.open(args.query_file))
    qids = [metadata['qid'] for _, metadata in topics]
    model.save_topics(topics)

    # Walk the corpus and persist the extracted sentences plus qrels.
    sentences = get_sentences(summaryrank.open(args.corpus_file))
    model.save_sentences_qrels(sentences, qids=set(qids))
def import_trec_novelty(argv):
    """ Import TREC Novelty Track data """
    # CLI: model directory, TREC topic file, gzipped corpus tarball, qrels.
    parser = argparse.ArgumentParser(
        prog='import_trec_novelty',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=IMPORTER_DESCRIPTION,
        epilog=IMPORTER_EPILOG,
    )
    parser.add_argument('-m', dest='model', metavar='DIR', required=True,
                        help='store the processed data in DIR')
    parser.add_argument('query_file', help='query file, in TREC format')
    parser.add_argument('corpus_file',
                        help='corpus file, a tarball as distributed by TREC')
    parser.add_argument('qrels_file', help='relevance judgment file')
    args = parser.parse_args(argv)

    model = summaryrank.Model(args.model)

    # Topics come first: the qid list drives the corpus filtering below.
    topics = get_topics(summaryrank.open(args.query_file))
    qids = [metadata['qid'] for _, metadata in topics]
    model.save_topics(topics)

    # The corpus tarball is Latin-1 encoded; only judged queries are kept.
    qrels = get_qrels_set(summaryrank.open(args.qrels_file))
    sentences = get_sentences(tarfile.open(args.corpus_file, 'r:gz'),
                              qids=qids, qrels=qrels, charset='latin-1')
    model.save_sentences_qrels(sentences, qids=set(qids))
def import_mobileclick(argv):
    """ Import MobileClick-2 data """
    # CLI: model directory, queries, iUnits, and an optional weights file.
    parser = argparse.ArgumentParser(
        prog='import_mobileclick',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=IMPORTER_DESCRIPTION,
        epilog=IMPORTER_EPILOG,
    )
    parser.add_argument('-m', dest='model', metavar='DIR', required=True,
                        help='store the processed data in DIR')
    parser.add_argument('queries_file')
    parser.add_argument('iunits_file')
    parser.add_argument('weights_file', nargs='?')
    args = parser.parse_args(argv)

    model = summaryrank.Model(args.model)

    # get_topics() yields lazily here, so materialize before reusing.
    topics = list(get_topics(summaryrank.open(args.queries_file)))
    model.save_topics(topics)
    qids = [metadata['qid'] for _, metadata in topics]

    # iUnit weights are optional; without them the qrels set is empty.
    if args.weights_file:
        qrels = get_qrels(summaryrank.open(args.weights_file))
    else:
        qrels = {}
    sentences = get_sentences(summaryrank.open(args.iunits_file),
                              qrels=qrels, charset='utf8')
    model.save_sentences_qrels(sentences, qids=set(qids))
def contextualize(argv): """ Generate context features Generate two context features SentenceBefore[XYZ] and SentenceAfter[XYZ] for each feature XYZ in the given set. """ parser = AutoHelpArgumentParser(prog='contextualize') parser.add_argument('-f', dest='fields', metavar='LIST', help='select only these fields') parser.add_argument('vector_file', help='the input vector file') args = parser.parse_args(argv) selector = set() if args.fields: for comp in args.fields.split(','): if comp.find('-') >= 0: l, u = map(int, comp.split('-')) selector.update(range(l, u + 1)) else: selector.add(int(comp)) rows = svmlight_tools.get_rows(summaryrank.open(args.vector_file), with_preamble=True) preamble = next(rows) features = [] for fid, name in svmlight_tools.get_preamble_features(preamble): if not args.fields: selector.add(fid) if fid in selector: features.append('SentenceBefore[{}]'.format(name)) features.append('SentenceAfter[{}]'.format(name)) print '# Features in use' for fid, name in enumerate(features, 1): print '# {}: {}'.format(fid, name) new_fids = len(features) + 1 # From here onwards is Damiano's contribution pointer1 = None pointer2 = None pointer3 = None for line in rows: pointer1 = pointer2 pointer2 = pointer3 pointer3 = line new_features = {} if pointer2: current_head, current_comment = pointer2.split('# ') _, current_docid, _ = current_comment.split(':') current_fields = current_head.split() #SentenceBefore context feature: if not pointer1: # first sentence for fid in range(1, new_fids): if fid % 2 != 0: new_features[fid] = 0 else: #is it from the same document? 
previous_head, previous_comment = pointer1.split('# ') _, previous_docid, _ = previous_comment.split(':') previous_fields = dict([f.split(':') for f in previous_head.split()[2:]]) if previous_docid != current_docid: for fid in range(1, new_fids): if fid % 2 != 0: new_features[fid] = 0 else: new_fid = 1 for fid in selector: before_value = previous_fields[str(fid)] new_features[new_fid] = before_value new_fid += 2 #SencenceAfter context feature: next_head, _ = pointer3.split('# ') _, next_docid, _ = current_comment.split(':') next_fields = dict([f.split(':') for f in next_head.split()[2:]]) if next_docid != current_docid: for fid in range(1, new_fids): if fid % 2 == 0: new_features[fid] = 0 else: new_fid = 2 for fid in selector: after_value = next_fields[str(fid)] new_features[new_fid] = after_value new_fid += 2 #Print before and after: buffer_ = [current_fields[0], current_fields[1]] # print new_fids for k, v in new_features.iteritems(): buffer_.append('{}:{}'.format(k, v)) # print ' '.join(buffer_) print ' '.join(buffer_), '#', current_comment, # Special case: end of file current_head, current_comment = pointer3.split('# ') _, current_docid, _ = current_comment.split(':') current_fields = current_head.split() previous_head, previous_comment = pointer2.split('# ') _, previous_docid, _ = previous_comment.split(':') previous_fields = dict([f.split(':') for f in previous_head.split()[2:]]) new_features = {} #add BeforeSentence features if previous_docid != current_docid: for fid in range(1, new_fids): if fid % 2 != 0: new_features[fid] = 0 else: new_fid = 1 for fid in selector: before_value = previous_fields[str(fid)] new_features[new_fid] = before_value new_fid += 2 #Add AfterSentence features for fid in range(1, new_fids): if fid % 2 == 0: new_features[fid] = 0 buffer_ = [current_fields[0], current_fields[1]] # print new_fids for k, v in new_features.iteritems(): buffer_.append('{}:{}'.format(k, v)) # print ' '.join(buffer_) print ' '.join(buffer_), '#', 
current_comment,
def contextualize(argv): """ Generate context features Generate two context features SentenceBefore[XYZ] and SentenceAfter[XYZ] for each feature XYZ in the given set. """ parser = AutoHelpArgumentParser(prog='contextualize') parser.add_argument('-f', dest='fields', metavar='LIST', help='select only these fields') parser.add_argument('vector_file', help='the input vector file') args = parser.parse_args(argv) selector = set() if args.fields: for comp in args.fields.split(','): if comp.find('-') >= 0: l, u = map(int, comp.split('-')) selector.update(range(l, u + 1)) else: selector.add(int(comp)) rows = svmlight_tools.get_rows(summaryrank.open(args.vector_file), with_preamble=True) preamble = next(rows) features = [] for fid, name in svmlight_tools.get_preamble_features(preamble): if not args.fields: selector.add(fid) if fid in selector: features.append('SentenceBefore[{}]'.format(name)) features.append('SentenceAfter[{}]'.format(name)) print '# Features in use' for fid, name in enumerate(features, 1): print '# {}: {}'.format(fid, name) new_fids = len(features) + 1 # From here onwards is Damiano's contribution pointer1 = None pointer2 = None pointer3 = None for line in rows: pointer1 = pointer2 pointer2 = pointer3 pointer3 = line new_features = {} if pointer2: current_head, current_comment = pointer2.split('# ') _, current_docid, _ = current_comment.split(':') current_fields = current_head.split() #SentenceBefore context feature: if not pointer1: # first sentence for fid in range(1, new_fids): if fid % 2 != 0: new_features[fid] = 0 else: #is it from the same document? 
previous_head, previous_comment = pointer1.split('# ') _, previous_docid, _ = previous_comment.split(':') previous_fields = dict( [f.split(':') for f in previous_head.split()[2:]]) if previous_docid != current_docid: for fid in range(1, new_fids): if fid % 2 != 0: new_features[fid] = 0 else: new_fid = 1 for fid in selector: before_value = previous_fields[str(fid)] new_features[new_fid] = before_value new_fid += 2 #SencenceAfter context feature: next_head, _ = pointer3.split('# ') _, next_docid, _ = current_comment.split(':') next_fields = dict([f.split(':') for f in next_head.split()[2:]]) if next_docid != current_docid: for fid in range(1, new_fids): if fid % 2 == 0: new_features[fid] = 0 else: new_fid = 2 for fid in selector: after_value = next_fields[str(fid)] new_features[new_fid] = after_value new_fid += 2 #Print before and after: buffer_ = [current_fields[0], current_fields[1]] # print new_fids for k, v in new_features.iteritems(): buffer_.append('{}:{}'.format(k, v)) # print ' '.join(buffer_) print ' '.join(buffer_), '#', current_comment, # Special case: end of file current_head, current_comment = pointer3.split('# ') _, current_docid, _ = current_comment.split(':') current_fields = current_head.split() previous_head, previous_comment = pointer2.split('# ') _, previous_docid, _ = previous_comment.split(':') previous_fields = dict([f.split(':') for f in previous_head.split()[2:]]) new_features = {} #add BeforeSentence features if previous_docid != current_docid: for fid in range(1, new_fids): if fid % 2 != 0: new_features[fid] = 0 else: new_fid = 1 for fid in selector: before_value = previous_fields[str(fid)] new_features[new_fid] = before_value new_fid += 2 #Add AfterSentence features for fid in range(1, new_fids): if fid % 2 == 0: new_features[fid] = 0 buffer_ = [current_fields[0], current_fields[1]] # print new_fids for k, v in new_features.iteritems(): buffer_.append('{}:{}'.format(k, v)) # print ' '.join(buffer_) print ' '.join(buffer_), '#', 
current_comment,