def generate_feature_matrix(data, stemmer, **prune_params):
    config = Configuration()
    config.enable_image_fetching = False
    config.use_meta_language = False
    goose = Goose(config)

    _parser = HTMLParser()

    # Term-document index mapping each stemmed token to the documents it occurs in
    sr_index = HashedIndex()

    for url_path, label in data.items():

        if os.path.exists(url_path):
            with open(url_path, 'r') as html_file:
                html_text = html_file.read()

            # Extract the main article text with goose and unescape any HTML entities
            text = unicode(goose.extract(raw_html=html_text).cleaned_text)
            text = _parser.unescape(text)

            for token in word_tokenize(text, stemmer=stemmer):
                sr_index.add_term_occurrence(token, url_path)

    # Drop terms according to the supplied pruning parameters
    sr_index.prune(**prune_params)

    # TF-IDF weighted term-document feature matrix
    X = sr_index.generate_feature_matrix(mode='tfidf')

    # Binary label vector: 1 if the document has an associated label, 0 otherwise
    y = np.zeros(len(sr_index.documents()))
    for index, doc in enumerate(sr_index.documents()):
        y[index] = 0 if data[doc] is None else 1

    return X, y
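
A minimal usage sketch for the function above; the load_data_source arguments and the pruning keyword are illustrative assumptions based on the __main__ script further down, not taken from the original.

import textparser
from datasource import load_data_source

# Hypothetical call: data maps local HTML file paths to labels (or None for unlabelled pages)
data = load_data_source('/path/to/Reddit-Testing-Data', subreddit='python', samples=800)
X, y = generate_feature_matrix(data, textparser.NullStemmer(), min_frequency=2)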
Example 2
def cmd_readstream(args, t, active_events):

    import textwrap
    import cuttsum.corpora
    from goose import Goose, Configuration
    config = Configuration()
    config.enable_image_fetching = False
    g = Goose(config)
    raw_stream = True

    for arg in args:
        if arg == "articles":
            raw_stream = False

    for event in active_events:
        print event
        if event.query_id.startswith("TS13"):
            corpus = cuttsum.corpora.EnglishAndUnknown2013()
        elif event.query_id.startswith("TS14"):
            corpus = cuttsum.corpora.SerifOnly2014()
        else:
            raise Exception("Bad query id: {}".format(event.query_id))

        if raw_stream is True:
            from cuttsum.trecdata import SCChunkResource
            si_iter = SCChunkResource().streamitem_iter(event, corpus)
        else:
            from cuttsum.pipeline import ArticlesResource
            si_iter = ArticlesResource().streamitem_iter(event, corpus)

        for hour, path, si in si_iter:
            if si.body.clean_visible is not None:
                print si.stream_id
                try:
                    text_height = t.height - 4
                    #n_chars = t.
                    article = g.extract(raw_html=si.body.clean_html)
                    lines = textwrap.wrap(article.cleaned_text)
                    idx = 0
                    while 1:
                        print t.clear
                        print "hour:", hour
                        print "title:", article.title
                        print "article:"
                        print "\n".join(lines[idx:idx + text_height])
                        #print article.cleaned_text

                        with t.cbreak():
                            char = t.inkey()
                            if char == "i" and idx > 0:
                                idx -= 1  #idx - 1 if idx > 0 else 0
                            elif char == "k" and idx + text_height < len(
                                    lines):
                                idx += 1
                            elif char == "l":
                                break

                except Exception as e:
                    print e
                    continue
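
A hedged driver sketch for the reader above; treating t as a blessed Terminal is an assumption inferred from the t.cbreak()/t.inkey() calls, and the query id passed to get_events is illustrative (it mirrors the cuttsum example further down).

from blessed import Terminal
import cuttsum.events

t = Terminal()
events = cuttsum.events.get_events(by_query_ids=["TS13.1"])
cmd_readstream(["articles"], t, events)  # "articles" switches from raw chunks to the article stream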
def generate_feature_matrix(wiki, data, n_concepts=10, **word_concept_params):
    """
    Transforms a given data source to a corresponding feature matrix and label
    vector based on the "Bag of Concepts" model which uses Wikipedia as an
    exogenous knowledge source for Word Sense Disambiguation and as additional
    domain knowledge.

    Logging output is emitted according to the currently configured level of
    the root logger.
    :param wiki: WikiIndex instance backed by a database index
    :param data: data labels loaded using a load_data_source method
    :param n_concepts: number of concepts to use per page.
    :param word_concept_params: word concept parameters to use for generating concepts.
    :return: NumPy feature matrix and label vector.
    """

    config = Configuration()
    config.enable_image_fetching = False
    config.use_meta_language = False
    goose = Goose(config)

    results = {}
    concepts = set()

    # Iterate through the data and perform training
    for index, (abs_path, label) in enumerate(data.items()):
        if not os.path.exists(abs_path):
            continue

        with open(abs_path, 'r') as fp:
            html_text = fp.read()

        # Determine relative path using a simple heuristic
        cutoff = abs_path.find('pages/')
        rel_path = abs_path[cutoff + 6:]

        logging.info('\n%d: http://%s', index, rel_path[:-3])
        article = goose.extract(raw_html=html_text)

        if len(article.cleaned_text) > 500:
            logging.info('%s (%s)', article.title, label)

            search_results, terms, query_vector = wiki.word_concepts(
                article.cleaned_text, article.title, **word_concept_params)

            if search_results:
                results[abs_path] = [(sr.page_id, sr.weight)
                                     for sr in search_results[:n_concepts]]

                # Remove any concepts which have a weight of 0
                results[abs_path] = filter(lambda x: x[1] > 0,
                                           results[abs_path])

                for search_result in search_results[:n_concepts]:
                    concepts.add(search_result.page_id)

                logging.info(search_results[:n_concepts])
            else:
                logging.warning('No word concepts returned')
        else:
            logging.info('Document is of insufficient length')

    shape = (len(results), len(concepts))

    concepts_index = dict([(b, a) for (a, b) in enumerate(concepts)])

    feature_matrix = np.zeros(shape=shape)
    label_vector = np.zeros(len(results))

    for i, (abs_path, page_list) in enumerate(results.iteritems()):
        label_vector[i] = 1 if data[abs_path] is not None else 0

        for page_id, weight in page_list:
            j = concepts_index[page_id]
            feature_matrix[i, j] = weight

    return feature_matrix, label_vector
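
To make the shape of the output concrete, here is a short, hypothetical follow-up that feeds the matrix into a scikit-learn classifier; scikit-learn is not part of the original code, and wiki and data are assumed to be already constructed as described in the docstring.

from sklearn.linear_model import LogisticRegression

X, y = generate_feature_matrix(wiki, data, n_concepts=10)
clf = LogisticRegression().fit(X, y)
print clf.score(X, y)  # training accuracy only; use a held-out split in practice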
Example 5
    def __init__(self, need_stem):
        # set up goose
        config = Configuration()
        config.enable_image_fetching = False
        self._g = Goose(config)
        self._need_stem = need_stem
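
The constructor above has been excerpted from its enclosing class; a minimal reconstruction is sketched below, where the class name TextExtractor and the extract_text helper are hypothetical additions, not part of the original.

from goose import Goose, Configuration

class TextExtractor(object):
    def __init__(self, need_stem):
        # set up goose
        config = Configuration()
        config.enable_image_fetching = False
        self._g = Goose(config)
        self._need_stem = need_stem

    def extract_text(self, raw_html):
        # Hypothetical helper: return goose's cleaned article text;
        # stemming (controlled by need_stem) would happen downstream.
        return self._g.extract(raw_html=raw_html).cleaned_text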
from goose import Configuration, Goose
from HTMLParser import HTMLParser

import textparser
from datasource import load_data_source
from index.hashedindex import HashedIndex, load_meta
from utils import search_files

if __name__ == '__main__':

    import time
    t0 = time.time()

    _parser = HTMLParser()

    _config = Configuration()
    _config.enable_image_fetching = False
    _config.use_meta_language = False

    _goose = Goose(_config)

    # The Lancaster stemmer is very slow, so the null stemmer is used instead
    _stemmer = textparser.NullStemmer()

    data_path = '/home/michaela/Development/Reddit-Testing-Data'

    # Set the parameters to the program over here
    force_reindex = False
    parameters = {
        'samples': 800,
        'subreddit': 'python',
Example 7
import cuttsum.judgements
import cuttsum.events
import cuttsum.corpora
import streamcorpus as sc
from cuttsum.misc import stringify_streamcorpus_sentence

event = cuttsum.events.get_events(by_query_ids=["TS13.1"])[0]
corpus = cuttsum.corpora.EnglishAndUnknown2013()

example_path = "/scratch/t-chkedz/trec-data/articles/gold/2012_buenos_aires_rail_disaster/2012-02-23-01.sc.gz"
example_id = "1329959700-18d497cf08e3500f195066be60e6a201"

matches = cuttsum.judgements.get_merged_dataframe()

from goose import Goose, Configuration

config = Configuration()
config.enable_image_fetching = False
g = Goose(config)

def si_to_df(si):
    sents = []
    for s, sent in enumerate(si.body.sentences["lingpipe"]):
        sents.append(            


with sc.Chunk(path=example_path, mode="rb", message=corpus.sc_msg()) as chunk:


    for si in chunk:
        if si.stream_id == example_id:
            for s, sent in enumerate(si.body.sentences["lingpipe"]):
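
The example above is cut off in this excerpt. As a hedged sketch of one way it might continue, the matched stream item's sentences could be printed with the imported helper; the exact signature of stringify_streamcorpus_sentence is assumed from its name, not confirmed here.

with sc.Chunk(path=example_path, mode="rb", message=corpus.sc_msg()) as chunk:
    for si in chunk:
        if si.stream_id == example_id:
            for s, sent in enumerate(si.body.sentences["lingpipe"]):
                print s, stringify_streamcorpus_sentence(sent)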
Example 9
    def do_job_unit(self, event, corpus, unit, **kwargs):
        
        extractor = kwargs.get("extractor", "gold")
        data_dir = os.path.join(self.dir_, extractor, event.fs_name())
        chunks_resource = SCChunkResource()

        if not os.path.exists(data_dir):
            try:
                os.makedirs(data_dir)
            except OSError as exc:
                # Ignore a race where another worker created the directory first
                if exc.errno == errno.EEXIST and os.path.isdir(data_dir):
                    pass
                else:
                    raise

        if extractor == "gold":
            # Keep only the stream items whose ids appear in the judged matches
            import cuttsum.judgements
            all_matches = cuttsum.judgements.get_matches()
            matches = all_matches[all_matches["query id"] == event.query_id]
            stream_ids = set(
                matches["update id"].apply(
                    lambda x: "-".join(x.split("-")[:-1])).tolist())

            hours = set([datetime.utcfromtimestamp(
                            int(update_id.split("-")[0])).replace(
                             minute=0, second=0)
                         for update_id in matches["update id"].tolist()])
            hours = sorted(list(hours))
            hour = hours[unit]
            output_path = self.get_chunk_path(event, extractor, hour, corpus)
            gold_si = []
            for path in chunks_resource.get_chunks_for_hour(hour, corpus, event):
                with sc.Chunk(path=path, mode="rb", 
                        message=corpus.sc_msg()) as chunk:
                    for si in chunk:
                        if si.stream_id in stream_ids:
                            gold_si.append(si)

            gold_si.sort(key=lambda x: x.stream_id)
            for si in gold_si:
                print si.stream_id

            if os.path.exists(output_path):
                os.remove(output_path)
            with sc.Chunk(path=output_path, mode="wb", 
                    message=corpus.sc_msg()) as chunk:
                for si in gold_si:
                    chunk.add(si)

        elif extractor == "goose":

            import nltk
            from nltk.tokenize import WordPunctTokenizer
            sent_tok = nltk.data.load('tokenizers/punkt/english.pickle')
            word_tok = WordPunctTokenizer()           
 
            from goose import Goose, Configuration
            config = Configuration()
            config.enable_image_fetching = False
            g = Goose(config)
            
            hour = event.list_event_hours()[unit]
            output_path_tmp = self.get_chunk_template(event, extractor, hour, corpus)
            good_si = []

            for path in chunks_resource.get_chunks_for_hour(hour, corpus, event):
                try:
                    with sc.Chunk(path=path, mode="rb", 
                            message=corpus.sc_msg()) as chunk:
                        
                        for si in chunk:

                            if si.body.clean_visible is None:
                                continue

                            article_text = self._get_goose_text(g, si)
                            if article_text is None:
                                continue

                            if not self._contains_query(event, article_text):
                                continue
                    
                            art_pretty = sent_tok.tokenize(article_text)
                            art_sents = [word_tok.tokenize(sent) 
                                         for sent in art_pretty]

                            df = si2df(si)
                            I = self._map_goose2streamitem(
                                art_sents, df["words"].tolist())
                                
                            if "serif" in si.body.sentences:
                                si_sentences = si.body.sentences["serif"]
                            elif "lingpipe" in si.body.sentences:
                                si_sentences = si.body.sentences["lingpipe"]
                            else:
                                raise Exception("Bad sentence annotator.")
                            
                            ann = sc.Annotator()
                            ann.annotator_id = "goose"
                            si.body.sentences["goose"] = [sc.Sentence() 
                                                          for _ in si_sentences]
                            for i_goose, i_si in enumerate(I):
                                #print art_pretty[i_goose]
                                #print df.loc[i_si, "sent text"]
                                #print
                                tokens = [sc.Token(token=token.encode("utf-8")) 
                                          for token in art_sents[i_goose]]
                                si.body.sentences["goose"][i_si].tokens.extend(
                                    tokens)
                            good_si.append(si)
                except TypeError:
                    continue
            #if len(good_si) == 0:
            #    print "Nothing in hour:", hour
            #    return 
            output_path = output_path_tmp.format(len(good_si))
            odir = os.path.dirname(output_path)
            if not os.path.exists(odir):
                os.makedirs(odir)
            good_si.sort(key=lambda x: x.stream_id)
            for si in good_si:
                print si.stream_id

            if os.path.exists(output_path):
                os.remove(output_path)

            print "Writing to", output_path
            with sc.Chunk(path=output_path, mode="wb", 
                    message=corpus.sc_msg()) as chunk:
                for si in good_si:
                    chunk.add(si)
        else:
            raise Exception("extractor: {} not implemented!".format(extractor))