def initialize(self, proxies_per_proto={}, user=None, passw=None, debug=False): print 'INIT: TwitterBot' self.__br = Browser() self.__br.set_proxies(proxies_per_proto) self.__br.set_debug_http(debug) self.__debug = debug self.__sandman = SandMan('TwitterBot') self.__ngd = NGD() self.__ngd.set_proxies(proxies_per_proto) self.__lock = Lock() try: # sign in self.__br.open("http://twitter.com/") self.__br.select_form(nr=1) self.__br['session[username_or_email]'] = user self.__br['session[password]'] = passw resp = self.__br.submit() time.sleep(0.2) except Exception, e: if self.__debug: traceback.print_exc(file=sys.stdout) print str(e) print 'EXCEPTION on TwitterBot, possibly bad user/password or https login don\' work behind a proxy.'
class TwitterBot: __favorites_regex = '<a href="http://twitter.com/[a-zA-Z0-9_]+" title="[a-zA-Z0-9\-_ .]+">[a-zA-Z0-9_]+</a>' __following_regex = '<a href="http://twitter.com/[a-zA-Z0-9_]+" rel="contact"><img alt="[a-zA-Z0-9\-_ .]+" class' __fav_complete_name_regex = 'title="[a-zA-Z0-9\-_ .]+">' __fav_complete_name_prefix = 'title="' __fav_complete_name_sufix = '">' __foll_complete_name_regex = 'img alt="[a-zA-Z0-9\-_ .]+" class' __foll_complete_name_prefix = 'img alt="' __foll_complete_name_sufix = '" class' __url_regex = 'href="http://twitter.com/[a-zA-Z0-9_]+" ' __url_prefix = 'href="' __url_sufix = '" ' def __init__(self): pass def initialize(self, proxies_per_proto={}, user=None, passw=None, debug=False): print 'INIT: TwitterBot' self.__br = Browser() self.__br.set_proxies(proxies_per_proto) self.__br.set_debug_http(debug) self.__debug = debug self.__sandman = SandMan('TwitterBot') self.__ngd = NGD() self.__ngd.set_proxies(proxies_per_proto) self.__lock = Lock() try: # sign in self.__br.open("http://twitter.com/") self.__br.select_form(nr=1) self.__br['session[username_or_email]'] = user self.__br['session[password]'] = passw resp = self.__br.submit() time.sleep(0.2) except Exception, e: if self.__debug: traceback.print_exc(file=sys.stdout) print str(e) print 'EXCEPTION on TwitterBot, possibly bad user/password or https login don\' work behind a proxy.'
def initialize(self, proxies_per_proto={}, user=None, passw=None, debug=False): print 'INIT: SearchEngineBot' self.__br = Browser() self.__br.set_proxies(proxies_per_proto) self.__br.set_debug_http(debug) self.__ngd = NGD(proxies_per_proto) self.__harvest_command = EmailHarvestingCommand() self.__harvest_command.set_only_complete_names(False) self.__sandman = SandMan('SearchEngineBot')
def __init__(self, proxies=None, entropy_filter=True, lang='en', entropy_top=3, query_top=100, fraction=5):
    """Configure the chat bot.

    proxies        -- protocol -> proxy map handed to the NGD search client
    entropy_filter -- when True, rank candidate answers by NGD distance
    lang           -- chat language; non-'en' lines get translated
    entropy_top    -- keep at most this many best-ranked answers
    query_top      -- number of snippets requested per search query
    fraction       -- number of random draws from the answer distribution
    """
    # None sentinel instead of a shared mutable default argument.
    if proxies is None:
        proxies = {}
    self.__ngd = NGD(proxies)
    #self.__ngd.set_context('site:imsdb.com')
    self.__cache = {}
    self.__min_ent = 0.0
    self.__entropy_filter = entropy_filter
    self.__lang = lang
    self.__entropy_top = entropy_top
    self.__fraction = fraction
    self.__query_top = query_top
    self.__translator = Translate()
    self.__lock = Lock()
    self.__voc_translator = None
    random.seed(666)  # deterministic draws across runs
class ChatBot: def __init__(self, proxies={}, entropy_filter=True, lang='en', entropy_top=3, query_top=100, fraction=5): self.__ngd = NGD(proxies) #self.__ngd.set_context('site:imsdb.com') self.__cache = {} self.__min_ent = 0.0 self.__entropy_filter = entropy_filter self.__lang = lang self.__entropy_top = entropy_top self.__fraction = fraction self.__query_top = query_top self.__translator = Translate() self.__lock = Lock() self.__voc_translator = None random.seed(666) def set_voc_translator(self, voc_trans=None): self.__voc_translator = voc_trans def entropy_min(self, e_min): self.__min_ent = e_min def reply_to(self, chat_line): self.__lock.acquire() try: chat_line = normalize_token(chat_line) if self.__lang != 'en': chat_line = self.__translator.translate( chat_line, self.__lang, 'en') snippets, answers = [], [] while len(answers) == 0: snippets = self.__ngd.snippets_query( '"%s" site:imsdb.com' % chat_line, self.__query_top) answers = self.__extract_answers(snippets, chat_line) if len(answers) == 0: chat_line = chat_line[:-1] if len(chat_line) == 0: break continue probabilities = self.__build_probs(answers) new_ans = [] for i in range(min(len(answers), self.__fraction)): new_ans.append(self.__choose_random_answer(probabilities)) answers = list(set(new_ans)) new_answers = [] for ans in answers: if self.__entropy_filter: val = self.__ngd.distance( ('"%s"' % chat_line, '"%s"' % ans.encode())) if val: print 'search engine distance (choosing response): %s %f' % ( ans, val) time.sleep(0.25) new_answers.append((ans, val)) if self.__entropy_filter: new_answers.sort(second_compare) #new_answers.reverse() new_answers = map(lambda x: x[0], new_answers[:self.__entropy_top]) answers = filter(lambda x: x in new_answers, answers) ans = None if len(answers) > 0: ans = answers[random.randint(0, len(answers) - 1)] if not ans: ans = 'ah' # use vocabulary translator, if available if self.__voc_translator: ans = self.__voc_translator(ans) if ans and self.__lang != 'en': ans = 
self.__translator.translate(ans, 'en', self.__lang).lower() if not ans: ans = 'ah' return ans finally: self.__lock.release() # release lock, no matter what def __extract_answer(self, snippet, chat_line): # [^\.!?]+ snippet = normalize_token(snippet) snippet = re.sub('\([^\)]+\) ', '', snippet) snippet = re.sub('\[[^\)]+\] ', '', snippet) iterator = re.finditer('[A-Z][A-Z]+ [^\.!?]+[\.!?]', snippet) lines = [] for match in iterator: line = match.group() #print line line_s = line.split(' ') line = ' '.join(line_s[1:]).lower() line = html2text(line) #print line line = line.replace('_', '').replace('\n', '') #line = re.sub( '\([^\)]+\) ', '', line) if not '-' in line and not ':' in line and not '**' in line and not '(' in line and not ')' in line and not '"' in line: if len(line) > 0 and line[-1] == '.': line = line[:-1] lines.append(line) #ret.append(strip(match)) #print strip(match) if len(lines) == 0: return '' prev = lines[0].lower() ret = [] for i in range(1, len(lines)): if chat_line.lower() in prev: ret.append(lines[i].lower()) prev = lines[i].lower() return ret def __extract_answers(self, snippets, chat_line): ret, ret_titles = [], [] for snippet in snippets: anss = self.__extract_answer(snippet, chat_line) for ans in anss: if ans != '': ret.append(ans.strip()) return ret def __build_probs(self, answers): d = {} for ans in answers: if not ans in d: d[ans] = 1 else: d[ans] += 1 ret = [] for ans, cnt in d.iteritems(): ret.append((ans, float(cnt) / len(answers))) return ret def __choose_random_answer(self, probs): rand_float = random.random() sum = 0.0 ret = None for ans, prob in probs: sum += prob if sum >= rand_float: ret = ans break return ret def start(self): msg = '' while msg != 'bye': msg = raw_input('You: ') ans, title = self.reply_to(msg.strip()) print 'end of chat.' def save_cache(self): self.__ngd.save_cache()
from distance import NGD, NMD, NYD import time google = NGD() msn = NMD() yahoo = NYD() def compare( a, b , google, msn, yahoo): g = google.distance(a,b) m = msn.distance(a,b) y = yahoo.distance(a,b) print 'for "%s" "%s"' % (a,b) print 'google: %f msn: %f yahoo: %f' % (g,m,y) print '' pairs = [('by','with'), ('quantum','physics'), ('quantum', 'football')] print time.ctime() b = time.time() #print google.distances((pairs*30)[:21]) a = time.time() print time.ctime() print 'took %d seconds' % (a-b) b = time.time() print google.distances((pairs*30)[:65], True) a = time.time() print time.ctime()
class SearchEngineBot: def __init__(self): pass def initialize(self, proxies_per_proto={}, user=None, passw=None, debug=False): print 'INIT: SearchEngineBot' self.__br = Browser() self.__br.set_proxies(proxies_per_proto) self.__br.set_debug_http(debug) self.__ngd = NGD(proxies_per_proto) self.__harvest_command = EmailHarvestingCommand() self.__harvest_command.set_only_complete_names(False) self.__sandman = SandMan('SearchEngineBot') # no sign in def set_proxies_per_proto(self, proxies): self.__proxies = proxies try: self.__ngd.set_proxies(proxies) except: print 'EXCEPTION on SeachEngineBot, possibly bad user/password or https login don\' work behind a proxy.' if len(proxies) == 0: proxy = None else: proxy = tuple(proxies['http'].split(':')) proxy = (proxy[0], int(proxy[1])) self.__proxy = proxy def set_sleep_secs(self, secs): self.__sandman.set_sleep_secs(secs) def set_sleep_module(self, iterations): self.__sandman.set_sleep_module(iterations) def set_sleep_failure(self, secs): self.__sandman.set_sleep_failure(secs) def set_sleep_random_flag(self, bool): self.__sandman.set_sleep_random_flag(bool) def self_email(self, email, name): if name.lower().startswith(email.split('@')[0].lower()): return True if len(name.split(' ')) == 1 and name.lower() == email.split( '@')[0].lower(): return True if len(name.split(' ')) == 2 and '.'.join( name.split(' ')).lower() == email.split('@')[0].lower(): return True if len(name.split(' ')) == 2 and '_'.join( name.split(' ')).lower() == email.split('@')[0].lower(): return True if len(name.split(' ')) == 2 and name.split( ' ')[0].lower() == email.split('@')[0].lower(): return True if len(name.split(' ')) == 2 and ( name.split(' ')[0][0] + name.split(' ')[1]).lower() == email.split('@')[0].lower(): return True return False def name_to_emails(self, (aliases, graph)): self.__harvest_command.set_only_complete_names(False) return self.__name_to_emails(aliases, 'all_mails')
# NOTE(review): mangled script fragment.  The first two assignments
# ('emails_with_name[e] = n', 'name_with_email[n] = e') are the body of a
# loop whose header lies before this chunk (presumably 'for e, n in ...:'),
# so the code is kept byte-for-byte -- reformatting would require guessing
# the missing context.  It then defines good_mail() (basic domain sanity
# check) and queries non-domain addresses per name via
# search_email_address_grabber, recording NGD hit counts per address.
emails_with_name[e] = n name_with_email[n] = e print "--------------------------------------------------------------------------------" print "emails with name" print str(emails_with_name) def good_mail(mail): s = mail.split("@") return len(s) > 1 and "." in s[1] and not ".." in s[1] # now we retrieve non-domain emails. mails_per_name = {} mails_hits = {} ngd = NGD() for e, n in emails_with_name.iteritems(): mails_per_name[n] = [] emails = search_email_address_grabber() emails.initialize(n, "name2email&domain") emails.targetRun() email_list = emails.finalize() for e2, n2 in email_list: if e2 != e and good_mail(e2): mails_per_name[n].append(e2) mails_hits[e2] = ngd.results(e2) for n, l in mails_per_name.iteritems(): print n print str(l) print "--------"