コード例 #1
0
ファイル: transforms.py プロジェクト: vpolimenov/conceptnet5
def choose_small_vocabulary(big_frame, concepts_filename, language,
                            vocab_size=50000):
    """
    Choose the vocabulary of the small frame, by eliminating the terms which:
     - contain more than one word (i.e. have an underscore in them)
     - are not in ConceptNet (i.e. are not listed in `concepts_filename`)
     - are not frequent

    `big_frame` is any object whose `.index` iterates over the candidate
    terms. `concepts_filename` is the path of a text file listing one known
    concept per line; surrounding whitespace on each line is ignored.
    `language` is passed through to `word_frequency`.

    Returns a list of at most `vocab_size` terms (50000 by default), ordered
    from the most frequent to the least frequent.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of whenever the garbage collector gets around to it.
    with open(concepts_filename) as concepts_file:
        concepts = {line.strip() for line in concepts_file}

    vocab = []
    for term in big_frame.index:
        if '_' in term or term not in concepts:
            continue
        text = uri_to_label(term)
        try:
            frequency = word_frequency(text, language, wordlist='large')
        except LookupError:
            # Fall back to the 'combined' wordlist when the 'large' lookup
            # fails for this language.
            frequency = word_frequency(text, language, wordlist='combined')
        vocab.append((term, frequency))

    # sorted() is stable, so terms with equal frequency keep the order in
    # which they appear in the frame's index.
    ranked = sorted(vocab, key=lambda pair: pair[1], reverse=True)
    return [term for term, _frequency in ranked[:vocab_size]]
コード例 #2
0
def get_vector(frame, label, language=None):
    """
    Returns the row of a vector-space DataFrame `frame` corresponding
    to the text `label`. If `language` is set, this can take in plain text
    and normalize it to ConceptNet form. Either way, it can also take in
    a label that is already in ConceptNet form.

    The form of the frame's index is detected from its first entry: if that
    entry starts with '/', the index is treated as URIs, otherwise as plain
    labels. `label` is converted to the matching form before the lookup.

    If the label is not found in the index (or the frame has no rows at
    all), the result is a float64 Series of NaNs indexed by the frame's
    columns.
    """
    if len(frame.index) == 0:
        # Nothing can match an empty index, and there is no first entry to
        # inspect, so go straight to the "not found" result.
        return pd.Series(index=frame.columns, dtype='float64')

    if frame.index[0].startswith('/'):  # This frame has URIs in its index
        if not label.startswith('/'):
            label = standardized_uri(language, label)
        try:
            return frame.loc[label]
        except KeyError:
            # Return a vector of all NaNs. The dtype is given explicitly
            # because an empty Series without one is not reliably numeric
            # across pandas versions.
            return pd.Series(index=frame.columns, dtype='float64')
    else:
        if label.startswith('/'):
            label = uri_to_label(label)
        try:
            return frame.loc[replace_numbers(label)]
        except KeyError:
            # Return a vector of all NaNs (explicitly float64, as above)
            return pd.Series(index=frame.columns, dtype='float64')