Esempio n. 1
0
def test_wordnik_patterns_match():
    """Every rule listed in patterns.yaml must match its paired example sentence."""
    from serapis.features import match_wordnik_rules

    with open("serapis/tests/data/patterns.yaml") as f:
        # safe_load: plain yaml.load without an explicit Loader is deprecated
        # and can construct arbitrary Python objects from the file.
        test_cases = yaml.safe_load(f)
    for rule, sentence in test_cases.items():
        assert rule in match_wordnik_rules(sentence), "Rule {} does not match '{}'".format(rule, sentence)
Esempio n. 2
0
def detect(message):
    """Takes a message that must contain a list of URL objects, each having
    at least a doc property. This will split the doc of each URL into
    sentences, and determine whether each sentence is an FRD or not.

    Each sentence dict gains the keys 'model_creation_date', 'patterns',
    'frd' and 'frd_likelihood'; the mutated message is then forwarded via
    write_message('save', ...).
    """
    batch_tag_sentences(message)

    # Load model artifacts once, outside the per-sentence loop.
    model_pipeline = PackagedPipeline().get()
    created_at = model_pipeline.metadata['created_at']

    feature_union = model_pipeline._feature_union
    model = model_pipeline._pipeline
    # Column index of the positive ('1') class in .predict_proba output.
    class_idx = np.where(model.classes_ == 1)[0][0]

    for url_object in message['urls']:
        readability_score(url_object)
        for sentence in url_object['sentences']:
            sentence_clean = sentence['s_clean']
            # Keep only the tag from each 'token/TAG' pair.
            # NOTE: a token with no '/' is kept whole (find() returns -1).
            pos = ' '.join([i[i.find('/') + 1:] for i in sentence['pos_tags'].split()])

            sentence_feature_union = feature_union.transform({
                's_clean': [sentence_clean],  # reuse the local instead of re-reading the dict
                'pos': [pos]
            })

            # metadata
            sentence['model_creation_date'] = created_at

            # predictions from model
            sentence['patterns'] = match_wordnik_rules(sentence_clean)
            sentence['frd'] = model.predict(sentence_feature_union)[0]
            sentence['frd_likelihood'] = round(model.predict_proba(sentence_feature_union)[0][class_idx], 4)  # P(Classification as FRD)

    return write_message('save', message)
Esempio n. 3
0
def test_wordnik_patterns_match():
    """Check that each wordnik rule fires on its example sentence from patterns.yaml."""
    from serapis.features import match_wordnik_rules
    with open("serapis/tests/data/patterns.yaml") as f:
        # yaml.load without a Loader argument is deprecated and unsafe;
        # safe_load restricts the file to plain YAML data.
        test_cases = yaml.safe_load(f)
    for rule, sentence in test_cases.items():
        assert rule in match_wordnik_rules(
            sentence), "Rule {} does not match '{}'".format(rule, sentence)
Esempio n. 4
0
def test_wordnik_patterns_perc():
    """Require the wordnik rules to cover more than 20% of the known-FRD sentences."""
    from serapis.features import match_wordnik_rules
    from serapis.preprocess import clean_sentence
    min_coverage = 0.2
    with open("serapis/tests/data/frds_wordnik.csv") as f:
        test_cases = list(csv.reader(f))
    matches = 0.0
    for term, sentence in test_cases:
        cleaned, _ = clean_sentence(sentence, term)
        if match_wordnik_rules(cleaned):
            matches += 1
    coverage = matches / len(test_cases)
    assert coverage > min_coverage, "Only matched {:.2f}% of data set".format(
        100 * coverage)
Esempio n. 5
0
def test_wordnik_patterns_perc():
    """The wordnik rule set must match over 20% of the labelled FRD sentences."""
    from serapis.features import match_wordnik_rules
    from serapis.preprocess import clean_sentence

    min_coverage = 0.2
    with open("serapis/tests/data/frds_wordnik.csv") as f:
        test_cases = list(csv.reader(f))
    # Count rows whose cleaned sentence triggers at least one rule.
    matches = float(sum(
        1 for term, sentence in test_cases
        if match_wordnik_rules(clean_sentence(sentence, term)[0])))
    assert matches / len(test_cases) > min_coverage, "Only matched {:.2f}% of data set".format(
        100 * matches / len(test_cases)
    )
Esempio n. 6
0
def detect(message):
    """Attach FRD predictions to every sentence of every URL in *message*.

    The message must contain a list of URL objects, each having at least a
    doc property; the doc is split into sentences and each sentence is
    classified as an FRD or not.
    """
    batch_tag_sentences(message)

    # Fetch the packaged model once, up front.
    packaged = PackagedPipeline().get()
    model_created_at = packaged.metadata['created_at']

    features = packaged._feature_union
    classifier = packaged._pipeline
    # Position of the positive class ('1') inside predict_proba's columns.
    positive_col = np.where(classifier.classes_ == 1)[0][0]

    for url_object in message['urls']:
        readability_score(url_object)
        for sent in url_object['sentences']:
            cleaned = sent['s_clean']
            # Strip the token from each 'token/TAG' pair, keeping the tag.
            tags = [tok[tok.find('/') + 1:] for tok in sent['pos_tags'].split()]
            pos = ' '.join(tags)

            fvec = features.transform({
                's_clean': [sent['s_clean']],
                'pos': [pos]
            })

            # metadata
            sent['model_creation_date'] = model_created_at

            # predictions from model
            sent['patterns'] = match_wordnik_rules(cleaned)
            sent['frd'] = classifier.predict(fvec)[0]
            proba = classifier.predict_proba(fvec)[0][positive_col]
            sent['frd_likelihood'] = round(proba, 4)  # P(classification as FRD)

    return write_message('save', message)