create_wordvec.py
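"""Train a word2vec model on recent Instagram captions.

Pulls geo-tagged US posts from the last 12 weeks out of the project's MySQL
database, cleans and tokenises the captions into sentences, and trains a
gensim word2vec model that is saved to disk.
"""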
import config
import sqlalchemy
import pandas as pd
import numpy as np
import requests
import geo_func
from datetime import datetime, timedelta
import bze_util as bze
import ftfy
import spacy
import gensim, logging
import re
from nltk.corpus import stopwords
import nltk.data
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# Load the punkt tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
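# NOTE: this assumes the NLTK punkt data is available locally, e.g. via nltk.download('punkt')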
def add_space_symbol(caption):
    # add a space around symbols like ) : ! ? so they become their own tokens
    rem_list = ['\'', '"', '/', '\\', '|', '}', '{', '[', ']', '@', '^', '*',
                '+', '-', '.', '`', '!', '?', '=', ':', ')', '(']
    for ch in rem_list:
        caption = caption.replace(ch, ' ' + ch + ' ')
    return caption
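# e.g. add_space_symbol("love it!") returns "love it ! " (note the padding spaces)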
def caption_to_wordlist(caption, remove_stopwords=False):
    # Convert a caption to a sequence of words, returning a list of words
    caption = ftfy.fix_text(caption)  # repair broken unicode / mojibake first
    caption = add_space_symbol(caption)
    caption_nohash = re.sub("#", "", caption)
    words = caption_nohash.lower().split()
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]
    return words
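# e.g. caption_to_wordlist("Best #tacos in LA!") -> ['best', 'tacos', 'in', 'la', '!']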
def caption_to_sentences(caption, tokenizer, remove_stopwords=False):
    # Split captions into parsed sentences.
    # Use the NLTK tokenizer to split the paragraph into sentences.
    raw_sentences = tokenizer.tokenize(caption.strip())
    sentences = []
    for raw_sentence in raw_sentences:
        # If a sentence is empty, skip it
        if len(raw_sentence) > 0:
            sentences.append(caption_to_wordlist(raw_sentence, remove_stopwords))
    return sentences
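# e.g. caption_to_sentences("Great pizza. Best #brunch!", tokenizer)
#      -> [['great', 'pizza', '.'], ['best', 'brunch', '!']]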
# bounding box used to select US posts: (lat_min, lat_max, lon_min, lon_max)
geo_box = (18.005611, 48.987386, -124.626080, -62.361014)
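# config.database is expected to map 'user', 'pass', 'host' and 'name' to the MySQL credentials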
# connect to server
engine = sqlalchemy.create_engine(
    'mysql://%(user)s:%(pass)s@%(host)s' % config.database)
engine.execute('use %s' % config.database['name'])  # select db
# only keep posts from the last 12 weeks
recent_date = (datetime.now() - timedelta(weeks=12)).strftime("%Y-%m-%d")
sql_query = '''SELECT post_date, latitude, longitude, image_url, likes, caption, post_url
               FROM instagram
               WHERE post_date > '%s'
                 AND latitude BETWEEN %s AND %s
                 AND longitude BETWEEN %s AND %s
               ORDER BY post_date DESC, likes DESC
            ''' % (recent_date, geo_box[0], geo_box[1], geo_box[2], geo_box[3])
posts = pd.read_sql_query(sql_query, engine, parse_dates=['post_date'])
n_points = posts.shape[0]
# drop posts without a caption and renumber the index
posts = posts[posts['caption'].notnull()]
posts = posts.reset_index(drop=True)
sentences = []  # Initialize an empty list of sentences
print("Parsing sentences from training set")
for caption in posts['caption']:
    sentences += caption_to_sentences(caption, tokenizer)
# Set values for the word2vec parameters
num_features = 400    # Word vector dimensionality
min_word_count = 30   # Minimum word count
num_workers = 4       # Number of threads to run in parallel
context = 12          # Context window size
downsampling = 1e-3   # Downsample setting for frequent words

from gensim.models import word2vec
# note: size= and init_sims() below use the pre-4.0 gensim API
model = word2vec.Word2Vec(sentences, workers=num_workers, size=num_features,
                          min_count=min_word_count, window=context,
                          sample=downsampling)
# training is finished, so keep only the normalised vectors to save memory
model.init_sims(replace=True)
model_name = "buzzeat_400features_30minwords_12context"
model.save(model_name)
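# Example use of the saved model (a sketch; "pizza" is just an illustrative query word):
#   model = word2vec.Word2Vec.load("buzzeat_400features_30minwords_12context")
#   model.most_similar("pizza", topn=10)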