Example #1
0
def vectorize(docs, model):
    """Return the mean word vector of the nouns MeCab finds in ``docs``.

    The text is tokenized with MeCab in ChaSen output mode, the tab-separated
    result is loaded into a DataFrame, and only rows whose column 3
    (presumably the part-of-speech field of the ChaSen format) contains
    "名詞" but neither "接頭詞" nor "サ変接続" are kept.  Words that are not
    in the model's vocabulary are dropped, and the vectors of the remaining
    words are averaged.

    Args:
        docs: Text passed directly to ``MeCab.Tagger.parse``.
        model: Object exposing ``model.wv.word_vec`` and a vocabulary mapping
            (``model.wv.key_to_index`` or the older ``model.wv.vocab``);
            presumably a gensim Word2Vec model.

    Returns:
        A ``pandas.DataFrame`` built from the averaged vector, or an empty
        list ``[]`` when anything goes wrong (tokenizer failure, no usable
        words, unexpected output layout, ...).  The ``[]`` fallback is kept
        for backward compatibility with existing callers.
    """
    try:
        mecab = MeCab.Tagger("-Ochasen")
        parsed = mecab.parse(docs)

        # Double quotes are removed up front so read_csv does not interpret
        # them as field quoting while splitting the tab-separated output.
        table = pd.read_csv(
            StringIO(parsed.replace("\"", "")), sep='\t', header=None)

        # Literal substring tests; na=False makes short rows (e.g. the
        # trailing "EOS" line, which has no column 3) count as "no match".
        pos = table[3]
        is_noun = pos.str.contains("名詞", regex=False, na=False)
        is_prefix = pos.str.contains("接頭詞", regex=False, na=False)
        is_sahen = pos.str.contains("サ変接続", regex=False, na=False)
        words = table.loc[is_noun & ~is_prefix & ~is_sahen, 0]

        # Newer gensim releases expose the vocabulary as ``key_to_index``;
        # older ones as ``vocab``.  Only the keys matter here.
        vocabulary = getattr(model.wv, "key_to_index", None)
        if vocabulary is None:
            vocabulary = model.wv.vocab
        words = words[words.isin(list(vocabulary))]

        # One vector per remaining word, averaged element-wise.  With no
        # words left the mean fails and the fallback below is returned.
        vectors = words.map(model.wv.word_vec).values
        return pd.DataFrame(vectors.mean(axis=0))

    except Exception:
        # Best-effort contract: never raise to the caller.  Unlike a bare
        # ``except:``, this lets KeyboardInterrupt/SystemExit propagate, and
        # the cause is recorded at debug level instead of vanishing.
        import logging

        logging.getLogger(__name__).debug(
            "vectorize failed; returning empty result", exc_info=True)
        return []