Esempio n. 1
0
def data_base2frame(doc2vec_model, min_blogs):
    """Build a per-blog DataFrame joining DB metadata with doc2vec embeddings.

    Parameters
    ----------
    doc2vec_model : gensim Doc2Vec model whose docvecs are assumed to be in
        the same order as the DB rows (ORDER BY id) -- TODO confirm.
    min_blogs : int
        Minimum number of blogs a channel must have to be kept.

    Returns
    -------
    (channeldf, word_cols) : the filtered DataFrame and the list of
        embedding column names ('dim0', 'dim1', ...).
    """
    conn = db_tools.get_conn()
    query = 'SELECT id, claps, blog_url, tags, author, pub_date, title FROM mediumcleanfull ORDER BY id'
    rows = conn.execute(query).fetchall()
    blogdf = pd.DataFrame(rows, columns=['id', 'claps', 'url', 'tags', 'author', 'pub_date', 'title'])
    # Channel is the path segment right after the domain: medium.com/<channel>/...
    blogdf['channel'] = blogdf['url'].map(lambda x: x.split('/')[3])

    # Tags look like postgres array literals; strip '{', '}' and '"'.
    blogdf['tags'] = blogdf['tags'].map(lambda word: re.sub('[{}"]', '', word))

    # One 'dimN' column per dimension of the document vectors; rows are
    # concatenated positionally, relying on the shared ORDER BY id ordering.
    doc_vectors = doc2vec_model.docvecs
    word_cols = ['dim%d' % x for x in range(len(doc_vectors[0]))]
    embedded_vectors = pd.DataFrame(np.asarray(doc_vectors), columns=word_cols)
    blogdf = pd.concat([blogdf, embedded_vectors], axis=1)

    # Keep only channels with at least min_blogs posts. 'the-nib' is dropped
    # because its embedding vectors are almost all zero; errors='ignore' so a
    # run where min_blogs already removed it does not raise KeyError.
    n_blogs = blogdf['channel'].value_counts()
    keep_channels = n_blogs[n_blogs >= min_blogs].index
    keep_channels = keep_channels.drop('the-nib', errors='ignore')
    channeldf = blogdf.loc[blogdf['channel'].isin(keep_channels), :]
    return channeldf, word_cols
Esempio n. 2
0
def main(fname='../doc2vec.model'):
    """Train and persist the document-embedding model.

    Parameters
    ----------
    fname : str
        Intended model path; currently unused -- DocEmbedder.save_model()
        chooses its own destination. Kept for interface compatibility.
    """
    # NOTE(review): the original also computed cpu_count(), opened a DB
    # connection, and built a query without ever using them (the connection
    # was leaked). Removed as dead code.
    embedder = DocEmbedder()
    embedder.train_model()
    embedder.save_model()
Esempio n. 3
0
 def open_spider(self, spider):
     """Open the DB connection and prepare the insert statement.

     Called by scrapy when the spider starts. Connection details (host,
     port, credentials) live in db_tools.get_conn(); the previous inline
     engine construction -- which read a password file path from this
     module -- was dead code and has been removed.
     """
     self.conn = db_tools.get_conn()
     # Parameterized insert; values are bound per scraped item later.
     self.query = text("""INSERT into mediumblogfull
                       (blog_url, textcontent,
                       img_url, img_path, title, claps,
                       author, pub_date, tags, channel)
                       VALUES (:blog_url, :textcontent,
                       :img_url, :img_path, :title, :claps,
                       :author, :pub_date, :tags, :channel)""")
Esempio n. 4
0
from Mediumrare import gensim_nlp, predictor_model, db_tools
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
import re
import pandas as pd
import numpy as np
import scipy.spatial.distance as dist
import matplotlib.pyplot as plt
import matplotlib
# %% load tags
conn = db_tools.get_conn()
tag_query = 'SELECT id, tags, claps, cleantext from mediumcleanfull ORDER BY id'

blogrows = conn.execute(tag_query).fetchall()

def remove_bad_chars(word):
    # Tags look like postgres array literals; strip '{', '}' and '"'.
    return re.sub('[{}"]', '', word)

tags = [remove_bad_chars(row[1]) for row in blogrows]
claps = [row[2] for row in blogrows]
ids = [row[0] for row in blogrows]

# Bag-of-words over the tag strings; min_df=2 drops tags seen only once.
countvectorizer = CountVectorizer(input='content',
                                  strip_accents='unicode',
                                  min_df=2)
tag_counts = countvectorizer.fit_transform(tags)
voc = countvectorizer.vocabulary_

# BUG FIX: vocabulary_ is a {term: column_index} dict; iterating it yields
# terms in insertion order, which need not match the column order of the
# count matrix. Sort terms by their column index so headers line up with
# the matrix columns. Also use toarray() (ndarray) instead of todense()
# (np.matrix, discouraged as a DataFrame source).
tagdf = pd.DataFrame(data=tag_counts.toarray(),
                     columns=sorted(voc, key=voc.get))
tagdf['claps'] = claps
tagdf['id'] = ids
tagdf['tags'] = tags
# %% get training examples