def test_categorize_relative_urls_provided(self):
    p = LinkParser()
    p.feed(base_url='http://feeds.huffingtonpost.com', html='''
        <link rel="alternate" type="application/rss+xml" title="The Full Feed" href="huffingtonpost/raw_feed" />
    ''')
    self.assertEqual(p.find_base_url(), 'http://feeds.huffingtonpost.com')
    self.assertListEqual([SAMPLE_RSS], p.data[RSS_KEY])
def xtest_500plus_links(self):
    p = LinkParser()
    input_html = read_file('01_input.html')
    p.feed(input_html, timeout=60)
    output_json = read_file('01_output.json')
    data = json.loads(output_json)
    self.assertSetEqual(set(data[RSS_KEY]), set(p.data[RSS_KEY]))
    self.assertSetEqual(set(data[ATOM_KEY]), set(p.data[ATOM_KEY]))
def __crawlPage(self, pageName):
    fullPageName = pageName
    # If the page is a url, go directly there; otherwise prepend the domain
    if pageName.find('://') != -1:
        page = self.__getPage(pageName)
    else:
        page = self.__getPage(self.domain + pageName)
    # Some link urls will be in the format /page1/page2;
    # we remove the leading slash to avoid http://site//page1/page2
    if pageName.startswith('/'):
        pageName = pageName[1:]
    parser = LinkParser(self.domain)
    parser.feed(page)
    pageLinks = parser.getLinks()
    self.discovered = self.discovered.union(pageLinks)
    # Convert links to a list for later json serialisation
    self.map.append({'page': fullPageName, 'links': list(pageLinks)})
def crawlAllUrl(self, outputFlag=False, crawlAmountLimit=CRAWL_AMOUNT_LIMIT):
    while len(Crawler.urlList) > 0:
        # lock the shared queue while popping the next url
        Crawler.urlRecordLock.acquire()
        url = Crawler.urlList.pop()
        pathname = self.url2Pathname(url)
        Crawler.urlNotDone.pop(pathname)
        if Crawler.crawledAmount >= crawlAmountLimit:
            Crawler.urlRecordLock.release()
            break
        Crawler.urlRecordLock.release()
        result = self.crawlUrl(NORMAL_SITE, url, outputFlag)
        try:
            urlArr = urlparse.urlparse(url)
            # if the url could not be crawled, accumulate to the errorCounter
            if result == False:
                Crawler.urlRecordLock.acquire()
                if Crawler.errorCounter.has_key(urlArr.netloc):
                    Crawler.errorCounter[urlArr.netloc] += 1
                else:
                    Crawler.errorCounter[urlArr.netloc] = 1
                Crawler.urlRecordLock.release()
                continue
            if Crawler.errorCounter[urlArr.netloc] > MIN_ERRORS_ALLOWED_FOR_A_SITE:
                continue
            # strip a trailing file name from the path so relative addresses resolve
            _path = urlArr.path
            rightMostSlashIndex = _path.rfind('/')
            replaced = _path[rightMostSlashIndex:len(_path)]
            if replaced.find('.') != -1:
                _path = _path.replace(replaced, '')
            hostPath = urlArr.scheme + '://' + urlArr.netloc + _path
            parser = LinkParser()
            parser.setFlag(NORMAL_SITE)
            parser.setHostPath(hostPath)
            parser.feed(result)
            urlList = parser.hrefsList
            Crawler.urlRecordLock.acquire()
            self.addUrlList(urlList)
            Crawler.crawledAmount += 1
            Crawler.urlRecordLock.release()
            parser.close()
        except Exception, e:
            # print(e)
            self.reportError(url, msg[ERROR_HTML_PARSE])
def request_data(endpoint, request, timeout):
    next_link = endpoint + request
    token = get_auth_token()
    headers = {'Authorization': token}
    response = []
    while next_link != '':
        try:
            r = requests.get(next_link, headers=headers, timeout=(3, timeout))
        except requests.exceptions.Timeout as e:
            logger.warn('Request %s timed out after %f seconds.', next_link, timeout)
            return [598, response]
        except requests.exceptions.ConnectionError as e:
            logger.error('Caught %s', e.message)
            app_config.request_connection_failure = True
            raise
        app_config.request_connection_failure = False
        if r.status_code == 200:
            response.append(r.text)
            if 'link' in r.headers:
                link_string = r.headers['link']
                lp = LinkParser(link_string)
                next_link = lp.get_link('next')
            else:
                next_link = ''
        else:
            logger.warn('Failed request with status code %d', r.status_code)
            return [r.status_code, response]
    return [200, response]
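A hypothetical caller for request_data() is sketched below; the endpoint, path, and per-page JSON decoding are illustrative assumptions, not taken from the original module.

import json

# Hypothetical usage sketch: the endpoint and path are placeholders, and each
# page body returned by request_data() is assumed to be a JSON array.
status, pages = request_data('https://api.example.com', '/v1/items', timeout=10)
if status == 200:
    items = [item for page in pages for item in json.loads(page)]
else:
    logger.warn('Stopped with status %d after %d complete pages', status, len(pages))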
def test_multiple_runs(self):
    p = LinkParser()
    p.feed('''
        <link rel="canonical" href="http://feeds.huffingtonpost.com" />
        <link rel="alternate" type="application/rss+xml" title="The Full Feed" href="huffingtonpost/raw_feed" />
    ''')
    p.feed(
        '<link rel="alternate" type="application/atom+xml" href="http://feeds.feedburner.com/PTCC" />'
    )
    self.assertListEqual([SAMPLE_RSS], p.data[RSS_KEY])
    self.assertListEqual([SAMPLE_ATOM], p.data[ATOM_KEY])
def test_all_links(self):
    link = ('<https://api.github.com/organizations/913567/repos?page=3>; rel="next", '
            '<https://api.github.com/organizations/913567/repos?page=5>; rel="last", '
            '<https://api.github.com/organizations/913567/repos?page=1>; rel="first", '
            '<https://api.github.com/organizations/913567/repos?page=1>; rel="prev"')
    lp = LinkParser(link)
    self.assertEqual(lp.get_link('next'),
                     'https://api.github.com/organizations/913567/repos?page=3')
    self.assertEqual(lp.get_link('last'),
                     'https://api.github.com/organizations/913567/repos?page=5')
    self.assertEqual(lp.get_link('prev'),
                     'https://api.github.com/organizations/913567/repos?page=1')
    self.assertEqual(lp.get_link('first'),
                     'https://api.github.com/organizations/913567/repos?page=1')
def main():
    initResult = init.initGlobal()
    crawler = Crawler()
    if initResult != False:
        # input
        print("Please enter your keyword")
        keyword = raw_input()
        keyword = keyword.replace(' ', '+')
        # start crawling from the search engine
        crawler = Crawler()
        startTime = time.time()
        crawler.loadRecord(LOG_OF_CRAWLED_URL)
        crawler.loadRecord(LOG_OF_CRAWLED_CONTENT)
        crawler.addSearchEngineUrl(keyword)
        htmlcode = crawler.crawlUrl(GOOGLE)
        parser = LinkParser()
        parser.setFlag(GOOGLE)
        parser.feed(htmlcode)
        top10 = parser.hrefsList
        crawler.addUrlList(top10, GOOGLE)
        parser.close()
        threadPool = []
        # run the work with THREAD_NUM threads
        while len(threadPool) <= THREAD_NUM:
            th = threading.Thread(None, crawl)
            threadPool.append(th)
        for item in threadPool:
            item.start()
        for item in threadPool:
            item.join()
        crawler.flush()
        endTime = time.time()
        print("time used:")
        print(endTime - startTime)
        keyword = raw_input()
def test_empty_string(self):
    lp = LinkParser("")
    self.assertEqual(lp.get_link('next'), '')
    self.assertEqual(lp.get_link('prev'), '')
    self.assertEqual(lp.get_link('last'), '')
    self.assertEqual(lp.get_link('first'), '')
def test_mal_formed_link_missing_equal(self):
    link = '<https://api.github.com/organizations/913567/repos?page=3>; rel~"next"'
    lp = LinkParser(link)
    self.assertEqual(lp.get_link('next'), '')
def test_mal_formed_link_missing_close_angle_bracket(self):
    link = '<https://api.github.com/organizations/913567/repos?page=3; rel="next"'
    lp = LinkParser(link)
    self.assertEqual(lp.get_link('next'), '')
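The tests above only exercise the LinkParser interface for HTTP Link headers; a minimal sketch of a parser that would satisfy them is given below. It is an illustrative assumption, not the project's actual implementation: it maps rel values to URLs and silently skips malformed entries so get_link() falls back to an empty string.

import re

class LinkParser(object):
    # Minimal sketch (assumption, not the real class): parse an RFC 5988 style
    # Link header of the form '<url>; rel="name", <url>; rel="name", ...'.
    _ENTRY = re.compile(r'<([^<>]+)>\s*;\s*rel="([^"]+)"')

    def __init__(self, link_header):
        self.links = {}
        for part in link_header.split(','):
            match = self._ENTRY.search(part)
            if match:  # malformed entries are skipped silently
                url, rel = match.groups()
                self.links[rel] = url

    def get_link(self, rel):
        # Unknown or unparsed relations return an empty string.
        return self.links.get(rel, '')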
def load_link_parser(self, path):
    """loads link parser"""
    self.lparser = LinkParser(path=path)
class FeatureLoader:
    """loads features for sentences"""

    def __init__(self):
        """init dictionary"""
        # note that this dict recognizes more british spellings than en_US but not all
        self.dict = enchant.Dict('en')

    def load_link_parser(self, path):
        """loads link parser"""
        self.lparser = LinkParser(path=path)

    def process_line(self, line):
        """returns feature dict for line"""
        line = line.strip()
        tokens = line.lower().split()
        features = {'length': len(tokens)}
        if len(line) == 0:
            return features
        try:
            features.update(self.feats_spelling(tokens))
        except:
            print >> sys.stderr, 'Error extracting spelling feats'
            print >> sys.stderr, traceback.format_exc()
        try:
            features.update(self.feats_link(' '.join(tokens)))
        except:
            print >> sys.stderr, 'Error extracting link feats'
            print >> sys.stderr, traceback.format_exc()
        try:
            features.update(self.feats_ngram_lm(tokens))
        except:
            print >> sys.stderr, 'Error extracting ngram/lm feats'
            print >> sys.stderr, traceback.format_exc()
        return features

    def process_file(self, fpath):
        """iterate through file and extract features by line"""
        all_features = []
        with open(fpath) as f:
            for i, l in enumerate(f):
                if i % 100 == 0:
                    print i
                all_features.append(self.process_line(l))
        return all_features

    def feats_spelling(self, tokens):
        """get spelling features"""
        n = 0
        miss = 0
        for s in tokens:
            if s.isalpha():
                n += 1
                if not self.dict.check(s):
                    miss += 1
        return {
            'num_miss': miss,
            'prop_miss': 1.0 * miss / max(1, n),
            'log_miss': log(miss + 1)
        }

    def load_lms(self, gpath, tpath):
        """load language models"""
        self.gigalm = kenlm.LanguageModel(gpath)
        self.toefllm = kenlm.LanguageModel(tpath)

    def get_ngram_prob(self, tokens):
        """get smoothed ngram prob from gigaword LM"""
        return self.gigalm.score(' '.join(tokens), bos=False, eos=False)

    def get_sent_prob(self, tokens, lm):
        """count OOVs and sum the full-sentence log score under lm"""
        oovs = 0
        score = 0
        for s in lm.full_scores(' '.join(tokens)):
            if s[2]:
                oovs += 1
            score += s[0]
        return oovs, score

    def feats_ngram_lm(self, tokens):
        """extract ngram and lm features"""
        features = {}
        for n in range(1, 4):
            if n > len(tokens):
                continue
            ngrams = Counter(
                [tuple(tokens[i:i + n]) for i in xrange(len(tokens) + 1 - n)])
            probabilities = [self.get_ngram_prob(ng) for ng in ngrams]
            features['min_s_%d' % n] = min(probabilities)
            features['max_s_%d' % n] = max(probabilities)
            features['sum_s_%d' % n] = sum(probabilities) / sum(ngrams.values())
        features['giga_oov'], features['giga_p'] = self.get_sent_prob(tokens, self.gigalm)
        features['toefl11_oov'], features['toefl11_p'] = self.get_sent_prob(tokens, self.toefllm)
        return features

    def feats_link(self, l):
        """extract link parser feature"""
        return {'complete_link': self.lparser.has_parse(l)}

    def get_next_block(self, infile):
        """return the next block of lines in a file until a blank line is read
        specifically for dealing with the parser output"""
        lines = None
        while True:
            l = infile.readline().strip()
            if l.strip() == '(())':
                raise TooLongError('Sentence too long to parse')
            if not l or len(l) == 0:
                return lines
            if lines is None:
                lines = []
            lines.append(l)
        return lines

    def load_parse_features(self, f):
        """read the stanford parser output to get parse features"""
        ret = []
        with open(f) as infile:
            while True:
                try:
                    next_parse = self.get_next_block(infile)
                except TooLongError:
                    ret.append(None)
                    continue
                if not next_parse:
                    break
                if next_parse[0].startswith('#'):
                    features = {}
                    features['parse'] = next_parse[1]
                    if next_parse[0].endswith('NA'):
                        continue
                    features['parse_score'] = float(next_parse[0].split()[-1])
                    features['sentential_top_node'] = next_parse[1].split()[1][1] == 'S'
                    features['dep_count'] = sum([1 if l.startswith('dep') else 0
                                                 for l in next_parse[2:]])
                    ret.append(features)
        return ret

    def load_hpsg_features(self, fpath):
        """given a path to the output of ./cheap, return a list of dictionaries
        that contain the specified features of each sentence"""
        features = None
        for line in open(fpath):
            if ':' not in line:
                continue
            key, value = line.split(':', 1)
            if key == 'id':
                # marks a new sentence
                if features:
                    yield (features)
                features = {}
            elif key in HPSG_FEATURES:
                features[key] = log(float(value) + 1)
        yield (features)
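A hypothetical driver for the class above; the link-grammar and language-model paths are placeholders, and the call order is just one plausible setup rather than the project's actual pipeline.

# Hypothetical usage sketch; both paths below are placeholders.
loader = FeatureLoader()
loader.load_link_parser('/path/to/link-grammar/data')
loader.load_lms('gigaword.arpa', 'toefl11.arpa')
features = loader.process_line('This is a sample sentence .')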
def test_empty_href(self):
    p = LinkParser()
    p.feed('<a href>test</a><link href><a href="' + SAMPLE_RSS + '"></a>')
    self.assertListEqual([SAMPLE_RSS], p.data[RSS_KEY])