Example #1
File: run.py Project: frnsys/galaxy
import json

from ftfy import fix_text_segment  # assumed source: ftfy's encoding fixer
from eval.util import progress     # project-local progress reporter


def clean(datapath):
    """
    Fix encoding errors in a data file and
    gets rid of data which still seems problematic.
    """
    red_flags = ['â€', 'Â']

    with open(datapath, 'r') as file:
        data = json.load(file)

    bad = []
    good = []
    for article in progress(data, 'Fixing {0} articles...'.format(len(data))):
        for key in ['title', 'text']:
            article[key] = fix_text_segment(article[key])

        flagged = False
        for flag in red_flags:
            if flag in article['title'] or flag in article['text']:
                bad.append(article)
                flagged = True
                break
        if not flagged:
            good.append(article)

    print('Getting rid of {0} bad articles.'.format(len(bad)))

    outpath = datapath.replace('.json', '_cleaned.json')
    with open(outpath, 'w') as file:
        json.dump(good, file)
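
A minimal invocation sketch (the path is hypothetical; the file is expected to hold a JSON array of article objects with 'title' and 'text' string fields):

clean('data/articles.json')
# Writes data/articles_cleaned.json, keeping only the articles whose
# fixed title and text are free of the mojibake markers.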
Example #2
import pickle

from scipy.sparse import hstack  # assumed: the feature rows are sparse matrices
from eval.util import progress   # project-local progress reporter


def build_kc_vectors(articles, savepath=None):
    bow_vecs, concept_vecs = [], []

    for a in progress(articles, 'Building article vectors...'):
        bow_vecs.append(a.vectors)
        concept_vecs.append(a.concept_vectors)

    print('Merging vectors...')
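    # Concatenate the bag-of-words and concept features column-wise into
    # a single feature matrix.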
    vecs = hstack([bow_vecs, concept_vecs])
    print('Using {0} features.'.format(vecs.shape[1]))

    if savepath:
        with open(savepath, 'wb') as f:
            pickle.dump(vecs, f)

    return vecs
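
A usage sketch, assuming articles is an iterable of objects exposing precomputed .vectors (bag-of-words) and .concept_vectors attributes; the save path is hypothetical:

vecs = build_kc_vectors(articles, savepath='kc_vecs.pkl')

# The pickled feature matrix can be reloaded later without rebuilding:
import pickle
with open('kc_vecs.pkl', 'rb') as f:
    vecs = pickle.load(f)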
Example #3
import json

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer

from eval.util import progress

# keywords, concepts, ConceptTokenizer, and pipe are assumed to be
# project-local helpers (concept extraction and pipeline persistence).


def train(docs, n_components=200, pipetype='stanford'):
    """
    Trains and serializes (pickles) a vectorizing pipeline
    based on training data.

    `min_df` is set to filter out extremely rare words,
    since we don't want those to dominate the distance metric.

    `max_df` is set to filter out extremely common words,
    since they don't convey much information.
    """
    pipeline = Pipeline([
        ('vectorizer', CountVectorizer(input='content',
                                       stop_words='english',
                                       lowercase=True,
                                       tokenizer=ConceptTokenizer(),
                                       min_df=0.01,
                                       max_df=0.9)),
        ('tfidf', TfidfTransformer(norm=None, use_idf=True, smooth_idf=True)),
        ('feature_reducer', TruncatedSVD(n_components=n_components)),
        ('normalizer', Normalizer(copy=False))
    ])

    print('Training on {0} docs...'.format(len(docs)))

    cons = []

    # Hint: n_components=150 is a good value here.
    if pipetype == 'keyword':
        for doc in progress(docs, 'Extracting concepts...'):
            cons.append('||'.join(keywords(doc)))

    # Hint: n_components=200 is a good value here.
    elif pipetype == 'stanford':
        for doc in progress(docs, 'Extracting concepts...'):
            cons.append('||'.join(concepts(doc, strategy='stanford')))

    # Hint: n_components=200 is a good value here.
    elif pipetype == 'spotlight':
        from http.client import BadStatusLine
        from time import sleep
        problems = 0
        max_retries = 5
        for doc in progress(docs, 'Extracting concepts...'):
            retries = 0
            while True:
                try:
                    cons.append('||'.join(concepts(doc, strategy='spotlight')))
                    break
                except BadStatusLine:
                    retries += 1
                    problems += 1
                    # Re-raise once the retry budget is spent, rather than
                    # silently dropping the document.
                    if retries >= max_retries:
                        raise
                    # Back off a little longer before each retry.
                    sleep(1 * retries)
        print('Had {0} problems.'.format(problems))
    else:
        raise ValueError('Unrecognized pipetype: {0}.'.format(pipetype))

    # temp: dump the extracted concepts for inspection (note the hardcoded path)
    with open('/Users/ftseng/{0}.json'.format(pipetype), 'w') as f:
        json.dump(cons, f)

    pipeline.fit(cons)

    pipe.save_pipeline(pipeline, pipetype)
    print('Training complete.')
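
A usage sketch, assuming docs is a list of raw document strings; the n_components values echo the hints in the function body:

train(docs, n_components=200, pipetype='stanford')

# The keyword pipeline is hinted to work well with fewer components:
train(docs, n_components=150, pipetype='keyword')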