def search_ngram(self, ngram, index):
    # NOTE: relies on the stdlib json module plus the project helpers
    # AcureRateUtils and F6SEngager being imported at module level.
    print("%s: Searching for [%s]" % (AcureRateUtils.get_now_as_str(), ngram))
    search_url = F6SEngager.F6S_SEARCH_URL % ngram
    # rc, response, ip = SatoriMain.perform_request(search_url, opener, with_ip=False, should_delay=False)
    rc, response = self.perform_request(search_url)
    if rc != 200:
        print(">>> ERROR: %s: %s." % (rc, response))
        return

    # Parse the results. The first element is expected to carry a summary
    # like "12 matches"; find() < 1 bails both when " match" is absent (-1)
    # and when the text starts with it (0), leaving no leading count to parse.
    results = json.loads(response)
    if results[0]['text'].find(' match') < 1:
        print("F6S Scraper: No mention of match(es) - %s." % results[0]['text'])
        return
    num_matches = int(results[0]['text'].split(' ')[0])
    if num_matches == 0:
        print("F6S Scraper: No hits returned when searching for %s." % ngram)
        return

    # Keep only results labeled 'Startup' whose text starts with the ngram
    startups_only = [res for res in results[1:]
                     if res.get('rightText') == 'Startup'
                     and res.get('text') and res['text'].lower().startswith(ngram)]
    if not startups_only:
        return

    # The search apparently caps results at 20, so a full page means the list
    # is truncated: refine the query by recursing with every one-character
    # extension of the current ngram before writing this page out.
    if len(startups_only) >= 20:
        for ch in self.all_valid_chars:
            self.search_ngram(ngram + ch, index + 1)

    print("%s: Found %s results for [%s]. Writing:" % (AcureRateUtils.get_now_as_str(), len(startups_only), ngram))
    # Write to file
    self.extract_and_write(startups_only, ngram)
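# The F6S search endpoint is assumed (inferred from the parsing above, not
# from documented API behavior) to return a JSON array shaped roughly like:
#   [{"text": "12 matches"},
#    {"text": "acme robotics", "type": "company", "value": "/acme-robotics", "rightText": "Startup"},
#    ...]
# i.e. a summary element carrying the hit count, followed by one element per
# autocomplete result. The field values shown here are illustrative only.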
def extract_and_write(self, startups, ngram):
    # Write one 'text; type; value' line per startup whose name starts with the ngram
    for res in startups:
        text = res['text']
        the_type = res['type']
        value = res['value']
        # Skip entries that don't actually start with the searched ngram
        if not text.lower().startswith(ngram):
            continue
        # Quote names containing the field delimiter so they don't break the line format
        if ';' in text:
            text = "'%s'" % text
        self.companies_file.write('%s; %s; %s\n' % (text, the_type, value))
        now_str = AcureRateUtils.get_now_as_str()
        print('%s: %s, %s, %s' % (now_str, text, the_type, value))
    self.companies_file.flush()
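# A minimal usage sketch, assuming these methods live on a scraper class that
# defines all_valid_chars (the alphabet used to seed and extend queries) and
# opens companies_file in its constructor. The class name F6SScraper is an
# assumption, not taken from this file:
#
#   scraper = F6SScraper()
#   for ch in scraper.all_valid_chars:
#       # Depth starts at 1; search_ngram recurses with longer prefixes
#       # whenever a query returns a full page of 20 startup hits.
#       scraper.search_ngram(ch, 1)
#   scraper.companies_file.close()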