def search_emails(self):
    """Crawl the selected URLs and merge every e-mail address found into
    ``self.emails`` (dict: address -> 'first_name last_name' or '').

    A try/finally is used deliberately so that partial results are still
    processed when the crawl is cancelled mid-run.
    """
    # Guard against select_urls() raising before wc is bound: without this,
    # the finally block would raise NameError and mask the real exception.
    wc = None
    try:
        urls = self.select_urls()
        wc = webcrawler(urls, self._regexps, True, self.depth, self.proxy)
        wc.crawl()
    finally:
        # Tokens that disqualify a 'first_name last_name' candidate.
        forbidden_substrings = ['escribi', 'resumes', 'write', 'email', 'cv ',
                                'comments', 'e-mail', 'contact', 'support',
                                'him ', 'please', ' from']
        forbidden_suffixes = [' to', ' list', ' lists', ' as', ' a']
        # Normalize every found address (skip entirely if the crawl never started).
        if wc is not None:
            for email in wc.matches:
                # Lowercase, since the DB is case insensitive.
                email = email.lower()
                # De-obfuscate ' at ' / ' dot ' and strip <b> highlight tags.
                email = email.replace(' at ', '@').replace(' dot ', '.')
                email = email.replace('<b>', '').replace('</b>', '')
                # NOTE(review): the original had email.replace('@', '@') here —
                # a no-op, presumably a mangled HTML-entity replacement
                # (e.g. '&#64;' -> '@'); dropped. TODO confirm against history.
                # More than one '@' means the match is garbage — skip it.
                if len(re.findall('@', email)) > 1:
                    continue
                parts = email.split(' ')
                if len(parts) > 1:
                    # Leading two tokens are treated as 'first_name last_name'.
                    names = parts[:2]
                    email = ' '.join(parts[2:])
                    joined = ' '.join(names)
                    for forb in forbidden_substrings:
                        if forb in joined:
                            names = []
                    for forb in forbidden_suffixes:
                        if joined.endswith(forb):
                            names = []
                    # Names containing digits are rejected too.
                    if re.search('[0-9]+', joined):
                        names = []
                else:
                    names = []
                    email = ' '.join(parts)
                # Trim a trailing dot and any stray '>' from markup.
                if email.endswith('.'):
                    email = email[:-1]
                email = email.replace('>', '')
                if '...' not in email:
                    # Only insert if unknown, or known but without a name yet.
                    if email not in self.emails or self.emails[email] == '':
                        self.emails[email] = ' '.join(names)
                elif self.canFixEmail(email, names):
                    # Truncated address ('...') that the helper can repair.
                    email = self.fixEmail(email, names)
                    if email not in self.emails or self.emails[email] == '':
                        self.emails[email] = ' '.join(names)
# Driver script: load raw data, crawl, build the index, then query it.
import data_load
import searcher
import indexer
import webcrawler

# Stage 1: walk the input corpus into raw_data.pickle.
data_load.traverser()
# Stage 2: fetch web data.
webcrawler.webcrawler()
# Stage 3: turn the raw/web pickles into the search index.
index_data = indexer.process_data("raw_data.pickle", "webdata.pickle")
# Stage 4: run a query against the built shelve.
searcher.search("fortune_shelve")
# NOTE(review): this block is a near-verbatim duplicate of the pipeline
# driver above (only whitespace differed) — consider deleting one copy.
import data_load
import searcher
import indexer
import webcrawler

# Traverse the corpus, crawl the web, index both pickles, then search.
data_load.traverser()
webcrawler.webcrawler()
processed = indexer.process_data("raw_data.pickle", "webdata.pickle")
searcher.search("fortune_shelve")