Example no. 1
import numpy as np
import pandas as pd
from itertools import chain

# tokenize, matchtags, getfeatures and iobtag are project-local helpers
# (matchtags and getfeatures appear in the other examples on this page).


def generatedata(path: str, testprop=0, parallel=False):
    """
    Return parsed and formatted sequences X, y to pass to python-crfsuite.
    X is a list of dictionaries containing features for each word;
    y is a list of labels with IOB tags.

    If testprop > 0 is specified, split X, y into training and testing sets
    and return X_train, y_train, X_test, y_test (in that order).
    """

    df = pd.read_csv(path)
    # Drop rows whose original text (input) or ingredient name is missing
    df = df.loc[pd.notna(df.name) & pd.notna(df.input)]

    if parallel:

        from pandarallel import pandarallel
        pandarallel.initialize(verbose=False)

        df.input = df.input.parallel_apply(
            lambda line: tokenize(line, preprocess=True))
        labels = df.parallel_apply(matchtags, axis=1)
        ind = np.random.choice([True, False],
                               size=len(labels),
                               p=[1 - testprop, testprop])

        features = df.input.parallel_apply(getfeatures)
        ioblabels = labels.parallel_apply(iobtag)

    else:

        df.input = df.input.apply(lambda line: tokenize(line, preprocess=True))
        labels = df.apply(matchtags, axis=1)
        ind = np.random.choice([True, False],
                               size=len(labels),
                               p=[1 - testprop, testprop])

        features = df.input.apply(getfeatures)
        ioblabels = labels.apply(iobtag)

    X_train = list(chain.from_iterable(features[ind]))
    y_train = list(chain.from_iterable(ioblabels[ind]))
    X_test = list(chain.from_iterable(features[np.invert(ind)]))
    y_test = list(chain.from_iterable(ioblabels[np.invert(ind)]))

    if testprop == 0: return X_train, y_train
    return X_train, y_train, X_test, y_test
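
As a quick illustration of how the returned sequences could be fed to python-crfsuite, here is a minimal sketch; the CSV path and model filename are placeholders, and the whole training set is passed as one long sequence, matching the flattened X/y above.

import pycrfsuite

# Placeholder path; any CSV with the name/input/unit/qty/range_end columns used above.
X_train, y_train, X_test, y_test = generatedata("ingredients.csv", testprop=0.1)

trainer = pycrfsuite.Trainer(verbose=False)
trainer.append(X_train, y_train)          # one long sequence, as returned above
trainer.train("ingredient-tagger.crfsuite")

tagger = pycrfsuite.Tagger()
tagger.open("ingredient-tagger.crfsuite")
predicted = tagger.tag(X_test)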
Example no. 3

def TIL_corpus_iter(skip=1):
    # Stream tokenized documents from the `training` table, keeping every
    # `skip`-th row; `conn_train` is an open database connection created
    # elsewhere (see the sketch below).
    cmd_select = "SELECT unprocessed_wikitext FROM training"

    cursor = conn_train.execute(cmd_select)
    for k, (text,) in enumerate(cursor):
        if k % skip == 0:
            text = unicode(text)  # Python 2 idiom; use str() on Python 3
            text = ' '.join(set(tokenize(text)))
            yield text.split()
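
The generator relies on a connection named conn_train that is created elsewhere in the script; a minimal sketch of how it might be wired up and consumed (the database filename is a placeholder):

import sqlite3

# Placeholder filename; the `training` table is expected to contain an
# `unprocessed_wikitext` column, as queried above.
conn_train = sqlite3.connect("training.db")

for doc_tokens in TIL_corpus_iter(skip=10):
    print(len(doc_tokens), doc_tokens[:5])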
Example no. 4
def getfeatures(line):
    """
    Build a feature dictionary for every token in an ingredient line.
    Accepts either a raw string or a pre-tokenized list from tokenize().
    """
    if type(line) is str:
        line = tokenize(line, preprocess=True)

    features = []
    comma = False
    isparenthetical = False

    for i in range(len(line)):

        token = line[i]
        if token == ')': isparenthetical = False

        token_features = {
            'token': token.lower(),
            'capitalized': token.istitle(),
            'parenthetical': isparenthetical,
            'unit': isunit(token),
            'numeric': isquantity(token),
            'symbol': token in symbols,
            'followscomma': comma
        }

        if (i == 0):
            prev_features = {'start': True}
        else:
            prv = line[i - 1]
            prev_features = {
                '-1token': prv.lower(),
                '-1capitalized': prv.istitle(),
                '-1numeric': isquantity(prv),
                '-1symbol': prv in symbols
            }

        if (i == len(line) - 1):
            next_features = {'end': True}
        else:
            nxt = line[i + 1]
            next_features = {
                '+1token': nxt.lower(),
                '+1capitalized': nxt.istitle(),
                '+1numeric': isquantity(nxt),
                '+1symbol': nxt in symbols
            }

        token_features.update(prev_features)
        token_features.update(next_features)
        features.append(token_features)

        # Track commas outside parentheses (the flag toggles at each comma)
        if not isparenthetical and token == ',': comma = not comma
        if token == '(': isparenthetical = True

    return features
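
A short usage sketch; tokenize, isunit, isquantity and the symbols set are helpers from the same project and are assumed to be importable, and the ingredient line is an invented example.

line = "1 1/2 tablespoons finely chopped shallots, divided (about 2 medium)"
for feats in getfeatures(line):
    print(feats['token'], feats['numeric'], feats['unit'],
          feats['followscomma'], feats['parenthetical'])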
Example no. 5
def matchtags(row):
    """
    Match each token in the input (raw text) to the appropriate label, if one exists.
    - We attempt to match singular and pluralized tokens ("shallot", "shallots")
    - Matching of fractions and floats is handled (1 1/2, 1.50)
    - We attempt to match units in alternative representations (tbsp, T, tablespoon)
    Return a list of labels.
    """

    ingr_tokens = tokenize(row["name"], preprocess=True)
    unit_tokens = tokenize(row["unit"], preprocess=True)
    #comment_tokens = tokenize(row["comment"], preprocess=True)

    labels = []

    for token in row["input"]:

        if asfloat(token) == row["qty"]:
            labels.append("QTY")

        elif round_2f(asfloat(token)) == row["range_end"]:
            labels.append("QTY-UR")

        elif any(
                tokenmatch(standardize(token).lower(), u.lower())
                for u in unit_tokens):
            labels.append("UNIT")

        elif any(tokenmatch(token.lower(), i.lower()) for i in ingr_tokens):
            labels.append("INGR")


#         elif token.lower() in comment_tokens:
#             labels.append("CMNT")

        else:
            labels.append(None)

    return labels
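
matchtags returns flat labels (QTY, QTY-UR, UNIT, INGR or None); Example no. 1 then converts them with an iobtag helper that is not shown on this page. Purely as an assumption about its behaviour, a minimal sketch could look like this:

def iobtag(labels):
    # Hypothetical helper: prefix the first token of each labelled span
    # with "B-" and continuations with "I-"; unlabelled tokens become "O".
    iob = []
    prev = None
    for label in labels:
        if label is None:
            iob.append("O")
        elif label == prev:
            iob.append("I-" + label)
        else:
            iob.append("B-" + label)
        prev = label
    return iob

iobtag(["QTY", "UNIT", "INGR", "INGR", None])
# -> ['B-QTY', 'B-UNIT', 'B-INGR', 'I-INGR', 'O']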
Example no. 7
def token_block_iter(title, html):
    # Yield (paragraph number, tokens, article id, article title, local
    # token frequencies) for every paragraph of a wiki article.
    wiki = paragraph_iter(html)

    token_table = list(map(tokenize, wiki))  # materialize so it can be re-iterated
    freq = frequency_table(token_table)

    wiki_title = unicode(wiki.title)  # Python 2 idiom; use str() on Python 3
    wiki_title_tokens = set(tokenize(wiki_title))

    wiki_index = wiki.id

    # Column normalize frequency table
    freq /= freq.sum(axis=0)

    for para_n, tokens in enumerate(token_table):
        # Drop tokens that also appear in the article title
        tokens = list(set(tokens).difference(wiki_title_tokens))

        local_freq = freq[tokens].iloc[para_n]  # .ix was removed in modern pandas
        yield para_n, tokens, wiki_index, wiki_title, local_freq
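
frequency_table is defined elsewhere in the project; one plausible implementation, assumed here only to make the column normalization and the freq[tokens] indexing above concrete, is a paragraphs-by-tokens count matrix built with pandas:

import pandas as pd
from collections import Counter


def frequency_table(token_table):
    # Hypothetical helper: one row per paragraph, one column per token,
    # cells hold raw counts (missing tokens filled with 0).
    return pd.DataFrame([Counter(tokens) for tokens in token_table]).fillna(0)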
Example no. 8
    def setUp(self):
        expression = '(abc)'
        self.expression_tokenized = tokenize(expression)
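
The fixture above belongs to a unittest.TestCase that is not shown; a minimal sketch of how it could be embedded and exercised, assuming tokenize is importable (the class name and the asserted behaviour are assumptions):

import unittest


class TokenizeParenthesesTest(unittest.TestCase):
    def setUp(self):
        expression = '(abc)'
        self.expression_tokenized = tokenize(expression)

    def test_parentheses_are_separate_tokens(self):
        # Assumed behaviour: the tokenizer splits parentheses from the word.
        self.assertIn('(', self.expression_tokenized)
        self.assertIn(')', self.expression_tokenized)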
Example no. 10
cmd_select_wiki = '''SELECT text FROM wiki WHERE title=?'''


def pprint_item(title, text):
    # Print a Markdown-style entry linking the title back to its Wikipedia page.
    # (Python 2: print statements, unicode literals, urllib.quote.)
    base_url = "http://en.wikipedia.org/w/index.php?title={}"

    try:
        url = base_url.format(urllib.quote(title))
        print u"# [{:s}]({:s})".format(title, url)
        print text.encode('utf-8')
        print
    except:
        pass


# For every positively scored paragraph, look up the source article,
# re-tokenize the matched paragraph, and store it in the report table.
for item in conn_decoy.execute(cmd_select_positive_scores):
    idx, wiki_idx, title, para_n = item

    html = conn_wiki.execute(cmd_select_wiki, (title, )).next()[0]
    paragraphs = list(paragraph_iter(html))
    title = unicode(title)

    if paragraphs:
        print title
        text = paragraphs[para_n]
        tokens = u' '.join(tokenize(text))

        conn_report.execute(cmd_insert_into_report, (title, text, tokens))

conn_report.commit()
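
The loop above depends on connections and SQL commands defined earlier in the script; a minimal sketch of what that wiring might look like (filenames, table and column names are assumptions inferred from how the results are unpacked):

import sqlite3

# All names below are placeholders.
conn_decoy = sqlite3.connect("decoy.db")
conn_wiki = sqlite3.connect("wiki.db")
conn_report = sqlite3.connect("report.db")

cmd_select_positive_scores = '''
    SELECT idx, wiki_idx, title, para_n FROM scores WHERE score > 0'''
cmd_insert_into_report = '''
    INSERT INTO report (title, text, tokens) VALUES (?, ?, ?)'''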
Example no. 11
def find_TIL_match(js):
    text = js["wiki"]
    url = js["url"]

    wiki_title = url.split('/')[-1].replace('_',' ').lower()
    wiki_title = wiki_title.split('#')[0]
    wiki_title = unidecode(urllib2.unquote(wiki_title))
    wiki_title_tokens = set(wiki_title.split())
   
    TIL_text = js["title"]
    TIL_tokens = set(tokenize(TIL_text))

    # Remove special tokens from TIL
    TIL_tokens = [x for x in TIL_tokens if len(x)>2 and "TOKEN_" not in x]
    
    paragraphs = list(split_by_paragraph(text))
    tokens = map(tokenize,paragraphs)
    freq = frequency_table(tokens)

    # Find words in TIL used in text
    matching_columns = list(set(freq.columns).intersection(TIL_tokens))

    # Idea: score each paragraph by how strongly it matches the TIL title
    df = freq[matching_columns]

    # Normalize each word across paragraphs, so words unique to a single
    # paragraph count for more
    df /= df.sum(axis=0)
    df.fillna(0, inplace=True)


    # Find the top scoring paragraph
    score = df.sum(axis=1)
    top_idx = score.argmax()
    match_text = paragraphs[top_idx]

    # Now, normalize over the full frequency table for the entropy weight
    freq /= freq.sum(axis=0)
    freq.fillna(0, inplace=True)
    tokens = list(freq.columns[freq.iloc[top_idx] > 0])
    weights = freq[tokens].iloc[top_idx]

    # Convert them into SQL-able formats
    w_str = '[{}]'.format(','.join(map("{:0.2f}".format, weights)))

    d_out = {
        "reddit_idx" : js["name"],
        "TIL"        : TIL_text,
        "unprocessed_wikitext"   : match_text,
        "tokens"   : ' '.join(tokens),
        "url"        : url,
        "score"      : js["score"],
        "weights"    : w_str
    }

    key_order = ["reddit_idx", "TIL",
                 "unprocessed_wikitext", "tokens",
                 "url", "score", "weights"]

    data_match = [d_out[key] for key in key_order]

    # Save the remaining paragraphs
    data_unmatch = []

    for n in range(len(paragraphs)):
        if n != top_idx:
            tokens = list(freq.columns[freq.iloc[n] > 0])
            weights = freq[tokens].iloc[n]

            assert len(tokens) == len(weights)
            if len(tokens) > 3:
                # Convert them into SQL-able formats
                w_str = '[{}]'.format(','.join(map("{:0.2f}".format, weights)))
                t_str = ' '.join(tokens)
                data_unmatch.append([t_str, w_str])

    return data_match, data_unmatch
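
A usage sketch for the function above; js mirrors the keys the function actually reads (name, title, url, score, wiki), but the values are invented placeholders.

js = {
    "name": "t3_example",                        # reddit submission id (placeholder)
    "title": "TIL that honey never spoils",
    "url": "http://en.wikipedia.org/wiki/Honey",
    "score": 1234,
    "wiki": open("honey_article.txt").read(),    # raw article text fetched elsewhere
}

data_match, data_unmatch = find_TIL_match(js)
# data_match follows key_order above; data_unmatch holds [tokens, weights]
# pairs for the remaining paragraphs.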