def data_base2frame(doc2vec_model, min_blogs, exclude_channels=('the-nib',)):
    """Join blog metadata from the DB with doc2vec embeddings, filtered by channel size.

    Parameters
    ----------
    doc2vec_model : trained gensim Doc2Vec model. Its ``docvecs`` are assumed to
        be in the same order as the ``ORDER BY id`` query below -- TODO confirm.
    min_blogs : int
        Minimum number of posts a channel must have to be kept.
    exclude_channels : iterable of str, optional
        Channel slugs dropped regardless of size. Defaults to ``('the-nib',)``,
        whose embedding vectors are almost all zero.

    Returns
    -------
    (channeldf, word_cols)
        ``channeldf``: DataFrame of kept channels' posts with embedding columns;
        ``word_cols``: list of the embedding column names (``dim0`` ... ``dimN``).
    """
    conn = db_tools.get_conn()
    query = 'SELECT id, claps, blog_url, tags, author, pub_date, title FROM mediumcleanfull ORDER BY id'
    rows = conn.execute(query).fetchall()
    blogdf = pd.DataFrame(rows, columns=['id', 'claps', 'url', 'tags', 'author', 'pub_date', 'title'])

    # Channel slug looks like the 4th path component of the URL
    # (https://medium.com/<channel>/...) -- TODO confirm against stored URLs.
    blogdf['channel'] = blogdf['url'].map(lambda x: x.split('/')[3])
    # Strip the postgres-array braces/quotes from the raw tags field.
    blogdf['tags'] = blogdf['tags'].map(lambda word: re.sub(r'[{}"]', '', word))

    # One embedding vector per document; build dim0..dimN columns.
    doc_vectors = doc2vec_model.docvecs
    word_cols = ['dim%d' % x for x in range(len(doc_vectors[0]))]
    embedded_vectors = pd.DataFrame(np.asarray(doc_vectors), columns=word_cols)
    blogdf = pd.concat([blogdf, embedded_vectors], axis=1)

    # Keep only channels with enough posts, minus any explicitly excluded ones.
    n_blogs = blogdf['channel'].value_counts()
    keep_channels = n_blogs[n_blogs >= min_blogs].index
    # errors='ignore' so the drop no longer raises KeyError when an excluded
    # channel did not survive the min_blogs filter (previous code crashed here).
    keep_channels = keep_channels.drop(list(exclude_channels), errors='ignore')
    channeldf = blogdf.loc[blogdf['channel'].isin(keep_channels), :]
    return channeldf, word_cols
def main(fname='../doc2vec.model'):
    """Train a document-embedding model over the corpus and save it to disk.

    Notes
    -----
    ``fname`` is kept for interface compatibility but is not used here;
    DocEmbedder presumably manages its own model path -- TODO confirm.
    The previous version also computed ``multiprocessing.cpu_count()`` and
    opened a DB connection without ever using either; both removed.
    """
    embedder = DocEmbedder()
    embedder.train_model()
    embedder.save_model()
def open_spider(self, spider):
    """Open the DB connection and prepare the parameterized INSERT statement.

    Presumably the Scrapy pipeline start hook -- ``spider`` is unused but the
    signature is fixed by that interface; TODO confirm. Connection details are
    centralized in ``db_tools.get_conn()`` (the old inline engine/credentials
    scaffolding was dead commented-out code and has been removed).
    """
    self.conn = db_tools.get_conn()
    # Named bind parameters keep this safe against SQL injection from scraped text.
    self.query = text("""INSERT into mediumblogfull (blog_url, textcontent, img_url, img_path, title, claps, author, pub_date, tags, channel) VALUES (:blog_url, :textcontent, :img_url, :img_path, :title, :claps, :author, :pub_date, :tags, :channel)""")
from Mediumrare import gensim_nlp, predictor_model, db_tools
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
import re
import pandas as pd
import numpy as np
import scipy.spatial.distance as dist
import matplotlib.pyplot as plt
import matplotlib

# %% load tags
# Pull id/tags/claps for every cleaned blog post, in id order.
conn = db_tools.get_conn()
tag_query = 'SELECT id, tags, claps, cleantext from mediumcleanfull ORDER BY id'
blogrows = conn.execute(tag_query).fetchall()

# Strip the postgres-array braces/quotes from the raw tags field.
remove_bad_chars = lambda word: re.sub(r'[{}"]', '', word)
tags = [remove_bad_chars(row[1]) for row in blogrows]
claps = [row[2] for row in blogrows]
ids = [row[0] for row in blogrows]

countvectorizer = CountVectorizer(input='content', strip_accents='unicode', min_df=2)
tag_counts = countvectorizer.fit_transform(tags)

# BUG FIX: vocabulary_ is a {term: column_index} dict whose iteration order is
# first-seen order, NOT column order, so passing it directly as `columns`
# mislabeled the count matrix. Order the terms by their column index instead.
voc = countvectorizer.vocabulary_
feature_names = sorted(voc, key=voc.get)
tagdf = pd.DataFrame(data=tag_counts.todense(), columns=feature_names)
tagdf['claps'] = claps
tagdf['id'] = ids
tagdf['tags'] = tags
# %% get training examples