Esempio n. 1
0
def worker(args):
    """Filter hourly StreamCorpus chunks down to event-relevant articles.

    Parameters are packed into a single ``args`` tuple so this function can
    be used as a ``multiprocessing`` map target:
        rc_dir   -- directory containing the input ``<hour>.sc.gz`` chunks
        out_dir  -- directory receiving the filtered output chunks
        hours    -- iterable of hour keys naming the chunks to process
        event    -- event object handed to ArticleDetector
        ad_dir   -- directory with the pickled vectorizer / classifier
        log_file -- path of a tab-separated per-hour statistics log

    For each hour, stream items that contain at least one relevant sentence
    (as judged by ArticleDetector) are written to the output chunk with the
    selected sentences stored under the ``'article-clf'`` annotator key.
    IOErrors for a single hour are reported and that hour is skipped.
    """
    rc_dir, out_dir, hours, event, ad_dir, log_file = args
    vct_pkl = os.path.join(ad_dir, 'article_vectorizer.pkl')
    clf_pkl = os.path.join(ad_dir, 'article_clf.pkl')
    artcl_detect = ArticleDetector(vct_pkl, clf_pkl, event)

    n_hours = len(hours)
    # Context manager guarantees the log handle is closed on every exit
    # path (the original leaked it).
    with open(log_file, 'w') as lgf:
        for h, hour in enumerate(hours, 1):
            n_docs = 0
            n_sents = 0
            n_rel_docs = 0
            n_rel_sents = 0

            chunk = os.path.join(rc_dir, '{}.sc.gz'.format(hour))
            opath = str(os.path.join(out_dir, '{}.sc.gz'.format(hour)))
            ochunk = sc.Chunk(path=opath, mode='wb')
            try:
                for si_idx, si in enumerate(sc.Chunk(path=chunk)):
                    n_docs += 1
                    # Prefer 'serif' sentence annotations, fall back to
                    # 'lingpipe'; skip items that carry neither.
                    if u'serif' in si.body.sentences:
                        annotator = u'serif'
                    elif u'lingpipe' in si.body.sentences:
                        annotator = u'lingpipe'
                    else:
                        continue

                    n_sents += len(si.body.sentences[annotator])
                    sent_idxs = artcl_detect.find_articles(si, annotator)
                    n_idxs = len(sent_idxs)
                    if n_idxs > 0:
                        n_rel_docs += 1
                        n_rel_sents += n_idxs
                        rel_sents = [si.body.sentences[annotator][sent_idx]
                                     for sent_idx in sent_idxs]
                        si.body.sentences['article-clf'] = rel_sents
                        ochunk.add(si)

                lgf.write('{}\t{}\t{}\t{}\t{}\n'.format(
                    hour, n_docs, n_sents, n_rel_docs, n_rel_sents))
                lgf.flush()
            except IOError as e:  # Py2.6+/Py3-compatible except syntax
                print(str(e))
            finally:
                # Close the output chunk even if reading the input failed
                # (the original skipped close() on error).
                ochunk.close()
Esempio n. 2
0
def worker(args):
    """Write per-hour chunks containing only event-relevant stream items.

    ``args`` is a tuple ``(rc_dir, out_dir, hours, event, ad_dir,
    log_file)`` so the function can be dispatched through a process pool:
    input chunks ``<hour>.sc.gz`` are read from ``rc_dir``, filtered with an
    ArticleDetector built from the pickles in ``ad_dir``, and written to
    ``out_dir``; per-hour counts go to the TSV log at ``log_file``.

    Relevant sentences are attached to each kept item under the
    ``'article-clf'`` annotator key.  An IOError aborts only the current
    hour; its message is printed and processing continues.
    """
    rc_dir, out_dir, hours, event, ad_dir, log_file = args
    vct_pkl = os.path.join(ad_dir, 'article_vectorizer.pkl')
    clf_pkl = os.path.join(ad_dir, 'article_clf.pkl')
    artcl_detect = ArticleDetector(vct_pkl, clf_pkl, event)

    n_hours = len(hours)
    # 'with' closes the log file on all exit paths; the original never
    # closed it.
    with open(log_file, 'w') as lgf:
        for h, hour in enumerate(hours, 1):
            n_docs = 0
            n_sents = 0
            n_rel_docs = 0
            n_rel_sents = 0

            chunk = os.path.join(rc_dir, '{}.sc.gz'.format(hour))
            opath = str(os.path.join(out_dir, '{}.sc.gz'.format(hour)))
            ochunk = sc.Chunk(path=opath, mode='wb')
            try:
                for si_idx, si in enumerate(sc.Chunk(path=chunk)):
                    n_docs += 1
                    # Use 'serif' sentences when present, else 'lingpipe';
                    # items with neither annotator are skipped.
                    if u'serif' in si.body.sentences:
                        annotator = u'serif'
                    elif u'lingpipe' in si.body.sentences:
                        annotator = u'lingpipe'
                    else:
                        continue

                    n_sents += len(si.body.sentences[annotator])
                    sent_idxs = artcl_detect.find_articles(si, annotator)
                    n_idxs = len(sent_idxs)
                    if n_idxs > 0:
                        n_rel_docs += 1
                        n_rel_sents += n_idxs
                        rel_sents = [si.body.sentences[annotator][sent_idx]
                                     for sent_idx in sent_idxs]
                        si.body.sentences['article-clf'] = rel_sents
                        ochunk.add(si)

                lgf.write('{}\t{}\t{}\t{}\t{}\n'.format(
                    hour, n_docs, n_sents, n_rel_docs, n_rel_sents))
                lgf.flush()
            except IOError as e:  # modern except syntax (works on Py2.6+/Py3)
                print(str(e))
            finally:
                # Guarantee the output chunk is flushed/closed even when an
                # IOError interrupts the read loop.
                ochunk.close()
Esempio n. 3
0
def _article_resource_worker(job_queue, result_queue, **kwargs):
    """Multiprocessing worker: filter corpus chunks to event-relevant items.

    Pulls ``(opath, chunk_paths)`` jobs from ``job_queue`` until it is
    empty.  For each job, stream items whose ``clean_visible`` text matches
    the event's regex (case-insensitively) are run through ArticleDetector;
    items with at least one relevant sentence are written to ``opath`` with
    those sentences stored under the ``u'article-clf'`` annotator key.
    ``None`` is put on ``result_queue`` after each completed job.

    kwargs:
        event  -- event object providing ``regex_pattern()``
        corpus -- corpus adapter providing ``sc_msg()`` / ``get_sentences()``
    """
    # Workers ignore SIGINT; shutdown is coordinated by the parent process.
    signal.signal(signal.SIGINT, signal.SIG_IGN)
    event = kwargs.get(u'event')
    corpus = kwargs.get(u'corpus')
    while not job_queue.empty():
        try:
            opath, chunk_paths = job_queue.get(block=False)
            artcl_detect = ArticleDetector(event)
            patt = event.regex_pattern()
            with sc.Chunk(path=opath, mode='wb',
                          message=corpus.sc_msg()) as ochunk:
                for path in chunk_paths:
                    for si in sc.Chunk(path=path, message=corpus.sc_msg()):
                        if si.body.clean_visible is None:
                            continue
                        # BUG FIX: the original called
                        # patt.search(text, re.I).  On a compiled pattern,
                        # the second argument of .search() is the start
                        # *position*, not a flags word, so re.I (== 2)
                        # silently skipped the first two characters instead
                        # of enabling case-insensitive matching.  Re-run the
                        # pattern string with re.I OR'ed into its flags.
                        if re.search(patt.pattern, si.body.clean_visible,
                                     patt.flags | re.I):
                            sentences = corpus.get_sentences(si)
                            sent_idxs = artcl_detect.find_articles(sentences)
                            if len(sent_idxs) > 0:
                                rel_sents = [sentences[idx]
                                             for idx in sent_idxs]
                                si.body.sentences[u'article-clf'] = rel_sents
                                ochunk.add(si)

            result_queue.put(None)
        except Queue.Empty:
            # Another worker drained the queue between empty() and get();
            # loop back and re-check the termination condition.
            pass