Example #1
import os


def main():

    # Load the event, its matching nuggets, and the hours the event spans.
    event_file, event_title, nuggets_tsv, odir = parse_args()
    event = load_event(event_title, event_file)
    nuggets = read_nuggets_tsv(nuggets_tsv, filter_query_id=event.query_id)
    hours = [dth for dth in gen_dates(event.start, event.end)]

    updates_file = os.path.join(odir, u"updates.txt")
    write_updates(nuggets, updates_file)

    # Walk the hours in order, draining nuggets timestamped at or before
    # each hour into the running summary. The zero-padded u"%Y-%m-%d-%H"
    # strings compare chronologically as plain strings.
    sum_sents = []
    for hour in hours:
        while len(nuggets) > 0:
            if nuggets[0].timestamp.strftime(u"%Y-%m-%d-%H") <= hour:
                sum_sents.append(nuggets[0].text)
                nuggets.pop(0)
            else:
                break

        if len(sum_sents) > 0:
            ofile = os.path.join(odir, u"{}.txt".format(hour))
            write_summary(sum_sents, ofile)

    # Any nuggets left after the final hour go into the final summary.
    if len(nuggets) > 0:
        sum_sents.extend([nugget.text for nugget in nuggets])

    ofile = os.path.join(odir, u"final.txt")
    write_summary(sum_sents, ofile)
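These snippets rely on helpers defined elsewhere in the module (parse_args, load_event, read_nuggets_tsv, gen_dates, write_updates, write_summary). As a rough guide to the shapes involved, here is a minimal sketch of what gen_dates and write_summary might look like; these are hypothetical stand-ins, not the actual implementations:

import codecs
from datetime import timedelta


def gen_dates(start, end):
    # Yield zero-padded hour strings from start to end (inclusive), assuming
    # datetime inputs and the u'%Y-%m-%d-%H' format compared against above.
    current = start
    while current <= end:
        yield current.strftime(u'%Y-%m-%d-%H')
        current += timedelta(hours=1)


def write_summary(sentences, path):
    # Write one sentence per line, UTF-8 encoded.
    with codecs.open(path, u'w', encoding=u'utf-8') as f:
        for sent in sentences:
            f.write(sent + u'\n')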
Example #2
import os

# Imports implied by the calls below; cosine_similarity is assumed to come
# from scikit-learn and sc to be the streamcorpus package.
import numpy as np
import streamcorpus as sc
from sklearn.metrics.pairwise import cosine_similarity

import cuttsum.wtmf


def main():

    event_file, rc_dir, event_title, nuggets_tsv, ss_params, ofile = parse_args()
    ss_model, ss_vocab, ss_dims = ss_params
    event = load_event(event_title, event_file)
    nuggets = read_nuggets_tsv(nuggets_tsv, filter_query_id=event.query_id)
    hours = [dth for dth in gen_dates(event.start, event.end)]
    print u"Found", len(nuggets), u"nuggets."

    # Embed the nugget texts in the sentence-similarity (WTMF) latent space.
    print u"Loading sentence-sim model..."
    wmat_model = cuttsum.wtmf.load_model(ss_model, ss_vocab, latent_dims=ss_dims)
    nugget_lvecs = wmat_model.factor_unicode([n.text for n in nuggets])

    # Collect candidate sentences from each hour's stream-corpus chunk,
    # recording (hour, stream-id, sentence-index) for each.
    meta_data = []
    unicodes = []
    print u"Loading sentence data..."
    nhours = len(hours)
    for h, hour in enumerate(hours, 1):
        chunk = os.path.join(rc_dir, u'{}.sc.gz'.format(hour))

        for si_idx, si in enumerate(sc.Chunk(path=chunk)):
            if u'article-clf' not in si.body.sentences:
                continue
            # Map each sentence's text to its index in the u'serif' parse.
            sent_idx_map = {}
            for idx, sent in enumerate(si.body.sentences[u'serif']):
                sent_idx_map[sentence_uni(sent)] = idx
            for sent in si.body.sentences[u'article-clf']:
                uni = sentence_uni(sent)
                meta_data.append((hour, si.stream_id, sent_idx_map[uni]))
                unicodes.append(uni)

    print u"Computing similarities..."
    sent_lvecs = wmat_model.factor_unicode(unicodes)    
    S = cosine_similarity(sent_lvecs, nugget_lvecs)
    S = np.ma.masked_array(S, np.isnan(S))

    Szmuv = (S - S.mean(axis=0)) / S.std(axis=0)
    M = np.amax(Szmuv, axis=1)
    m = np.amin(Szmuv, axis=1)
    U = np.mean(Szmuv, axis=1)
    T = np.sum(Szmuv, axis=1)
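    # Intuition: M, m, U, and T become the max-sim, min-sim, mean-sim, and
    # total-sim columns of the TSV below, summarizing how strongly each
    # sentence matches its best nugget, its worst nugget, and the nugget
    # pool overall.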

    ### WRITE TSV HEADER AND DATA ###
    print u"Writing to", ofile
    header = 'date-hour\tstream-id\tsent-id\tmax-sim\tmin-sim' + \
             '\tmean-sim\ttotal-sim'
    for i in range(ss_dims):
        header += '\tlv{}'.format(i)

    with open(ofile, 'w') as f:
        f.write(header)
        f.write('\n')
        for idx, meta_datum in enumerate(meta_data):
            # Seven fields to match the header: three metadata columns plus
            # the four similarity features.
            f.write('{}\t{}\t{}\t{}\t{}\t{}\t{}'.format(
                meta_datum[0], meta_datum[1], meta_datum[2],
                M[idx], m[idx], U[idx], T[idx]))
            for c in range(ss_dims):
                f.write('\t{}'.format(sent_lvecs[idx,c]))
            f.write('\n')
            f.flush()
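The chunk-reading loop above also assumes a sentence_uni helper that flattens a streamcorpus Sentence into a unicode string. A plausible minimal sketch, hypothetical rather than the repo's actual version:

def sentence_uni(sent):
    # streamcorpus stores token text as UTF-8 bytes; join them on spaces.
    return u' '.join(token.token.decode(u'utf-8') for token in sent.tokens)

Note that the same string must be produced for both the u'serif' and u'article-clf' taggings for the sent_idx_map lookup above to succeed.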
Example #3

def main():

    args = parse_args()
    event_file, rc_dir, event_title, nuggets_tsv = args[0:4]
    doc_freqs, word_freqs = args[4:]

    print 'Generating Regression Features'
    print '=============================='
    event = load_event(event_title, event_file)
    nuggets = read_nuggets_tsv(nuggets_tsv, filter_query_id=event.query_id)
    hours = [dth for dth in gen_dates(event.start, event.end)]

    worker((rc_dir, nuggets, hours, event, doc_freqs, word_freqs))
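worker is defined elsewhere; the call above passes all of its inputs as a single tuple. A hypothetical skeleton consistent with that calling convention:

def worker(job):
    # A single tuple argument keeps the signature compatible with
    # multiprocessing.Pool.map, even when run in-process as above.
    rc_dir, nuggets, hours, event, doc_freqs, word_freqs = job
    for hour in hours:
        # Hypothetical: compute per-hour regression features for the
        # chunks in rc_dir against the nugget texts.
        pass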