-
Notifications
You must be signed in to change notification settings - Fork 1
/
email_guess_helper.py
173 lines (159 loc) · 8.25 KB
/
email_guess_helper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
import requests
from parse import Parse
from google import Google
from bs4 import BeautifulSoup
from nameparser import HumanName
import pandas as pd
import arrow
import json
import pystache
from rq import Queue
from worker import conn
q = Queue(connection=conn)
class EmailGuessHelper:
def _remove_non_ascii(self, text):
return ''.join(i for i in text if ord(i)<128)
def _name_to_email_variables(self, name):
name = self._remove_non_ascii(name.strip())
fi = name.split(' ')[0][0]
li = name.split(' ')[-1][0]
first = name.split(' ')[0]
last = name.split(' ')[-1]
vals = [fi, li, first, last, name]
cols = ['first_initial','last_initial','first_name','last_name','name']
return dict(zip(cols,vals))
def _find_email_pattern(self, name, email):
''' Decifer Email Pattern '''
patterns = pd.DataFrame()
print name, email, 'find_email_pattern'
person = self._name_to_email_variables(str(name).strip())
person['domain'] = email.strip().split('@')[-1]
for pattern in self._patterns():
#_email = pattern.format(**person)
_email = pystache.render(pattern, person)
if email.lower() == _email.lower(): return pattern
def _bulk_find_email_pattern(self, domain, people):
''' Decifer Email Pattern '''
patterns = pd.DataFrame()
for index, person in people.iterrows():
for pattern in self._patterns():
#email = pattern.format(**person)
email = pystache.render(pattern, person)
if person['email'].lower() != email.lower(): continue
info = [pattern.strip(),
person['domain'].strip(),
person['email'].lower().strip(),
person['name'].title().strip()]
columns = ['pattern','domain','email','name']
patterns = patterns.append(dict(zip(columns, info)),
ignore_index=True)
return patterns
def _email_crawl_pointers(self, qry):
parse = Parse()
results = parse.get('CompanyEmailPatternCrawl', qry).json()
results = results['results'] if "results" in results.keys() else results
crawls = pd.DataFrame(results)
crawl_objectids = crawls.drop_duplicates('pattern').objectId
crawl_pointers = [parse._pointer('CompanyEmailPatternCrawl', objectId)
for objectId in crawl_objectids]
return crawl_pointers
def _update_prospect_with_email(self, contact, objectId):
print Parse().update('Prospect/'+objectId, contact).json()
print r.json()
''' '''
def _persist_email_guess(self, domain, upload):
''' Different Email Patterns '''
if upload.shape[0] == 0: return 0
for index, row in upload.iterrows():
print Parse().create('CompanyEmailPatternCrawl', row.to_dict()).json()
for domain in upload.domain.drop_duplicates():
qry = {'where':json.dumps({'domain': domain}),'order':'-createdAt'}
crawl_pointers = self._email_crawl_pointers(qry)
patterns = {'company_email_pattern': crawl_pointers}
pattern = Parse().get('CompanyEmailPattern', qry).json()
if pattern['results'] == []:
patterns['domain'] = domain
print Parse().create('CompanyEmailPattern', patterns).json()
else:
pattern = 'CompanyEmailPattern/'+pattern['results'][0]['objectId']
print Parse().update(pattern, patterns)
def _add_email_variables(self, contacts):
contacts['first_name'] = [self._remove_non_ascii(name).strip().split(' ')[0]
for name in contacts.name]
contacts['last_name'] = [self._remove_non_ascii(name).split(' ')[-1]
for name in contacts.name]
contacts['first_initial'] = [self._remove_non_ascii(name).split(' ')[0][0]
for name in contacts.name]
contacts['last_initial'] = [self._remove_non_ascii(name).split(' ')[0][-1]
for name in contacts.name]
return contacts
def _patterns(self):
return ['{{first_name}}@{{domain}}', '{{last_name}}@{{domain}}',
'{{first_initial}}@{domain}}', '{{last_initial}}@{{domain}}',
'{{first_name}}{{last_name}}@{{domain}}',
'{{first_name}}.{{last_name}}@{{domain}}',
'{{first_initial}}{{last_name}}@{{domain}}',
'{{first_initial}}.{{last_name}}@{{domain}}',
'{{first_name}}{{last_initial}}@{{domain}}',
'{{first_name}}.{{last_initial}}@{{domain}}',
'{{first_initial}}{{last_initial}}@{{domain}}',
'{{first_initial}}.{{last_initial}}@{{domain}}',
'{{last_name}}{{first_name}}@{{domain}}',
'{{last_name}}.{{first_name}}@{{domain}}',
'{{last_name}}{{first_initial}}@{{domain}}',
'{{last_name}}.{{first_initial}}@{{domain}}',
'{{last_initial}}{{first_name}}@{{domain}}',
'{{last_initial}}.{{first_name}}@{{domain}}',
'{{last_initial}}{{first_initial}}@{{domain}}',
'{{last_initial}}.{{first_initial}}@{{domain}}',
'{{first_name}}-{{last_name}}@{{domain}}',
'{{first_initial}}-{{last_name}}@{{domain}}',
'{{first_name}}-{{last_initial}}@{{domain}}',
'{{first_initial}}-{{last_initial}}@{{domain}}',
'{{last_name}}-{{first_name}}@{{domain}}',
'{{last_name}}-{{first_initial}}@{{domain}}',
'{{last_initial}}-{{first_name}}@{{domain}}',
'{{last_initial}}-{{first_initial}}@{{domain}}',
'{{first_name}}_{{last_name}}@{{domain}}',
'{{first_initial}}_{{last_name}}@{{domain}}',
'{{first_name}}_{{last_initial}}@{{domain}}',
'{{first_initial}}_{{last_initial}}@{{domain}}',
'{{last_name}}_{{first_name}}@{{domain}}',
'{{last_name}}_{{first_initial}}@{{domain}}',
'{{last_initial}}_{{first_name}}@{{domain}}',
'{{last_initial}}_{{first_initial}}@{{domain}}']
def _old_patterns(self):
return ['{first_name}@{domain}', '{last_name}@{domain}',
'{first_initial}@{domain}', '{last_initial}@{domain}',
'{first_name}{last_name}@{domain}',
'{first_name}.{last_name}@{domain}',
'{first_initial}{last_name}@{domain}',
'{first_initial}.{last_name}@{domain}',
'{first_name}{last_initial}@{domain}',
'{first_name}.{last_initial}@{domain}',
'{first_initial}{last_initial}@{domain}',
'{first_initial}.{last_initial}@{domain}',
'{last_name}{first_name}@{domain}',
'{last_name}.{first_name}@{domain}',
'{last_name}{first_initial}@{domain}',
'{last_name}.{first_initial}@{domain}',
'{last_initial}{first_name}@{domain}',
'{last_initial}.{first_name}@{domain}',
'{last_initial}{first_initial}@{domain}',
'{last_initial}.{first_initial}@{domain}',
'{first_name}-{last_name}@{domain}',
'{first_initial}-{last_name}@{domain}',
'{first_name}-{last_initial}@{domain}',
'{first_initial}-{last_initial}@{domain}',
'{last_name}-{first_name}@{domain}',
'{last_name}-{first_initial}@{domain}',
'{last_initial}-{first_name}@{domain}',
'{last_initial}-{first_initial}@{domain}',
'{first_name}_{last_name}@{domain}',
'{first_initial}_{last_name}@{domain}',
'{first_name}_{last_initial}@{domain}',
'{first_initial}_{last_initial}@{domain}',
'{last_name}_{first_name}@{domain}',
'{last_name}_{first_initial}@{domain}',
'{last_initial}_{first_name}@{domain}',
'{last_initial}_{first_initial}@{domain}']