def main():
    urls = utils.read_url("input.csv")
    i = 1
    for url in urls:
        print("Processing", url)
        utils.write_url("output.csv", process_url(url), fieldnames)
        i += 1
def _gather_http(self):
    """Get internal node information via HTTP or HTTPS; delete the first line."""
    url = utils.apply_template(self.provider, self.values)
    result = utils.read_url(url)
    if result:
        del result[0]
        result = ''.join(result)
    return self._return(yaml.load(result))
def get_recommended_tbb_version():
    """Get the recommended TBB version from RecommendedTBBVersions file."""
    tbb_versions_url = "https://www.torproject.org/projects/torbrowser/RecommendedTBBVersions"  # noqa
    versions = ut.read_url(tbb_versions_url)
    for line in versions.split():
        if "Linux" in line:
            return line.split("-")[0].lstrip('"')
    raise TBBGetRecommendedVersionError()
def get_recommended_tbb_version():
    """Get the recommended TBB version from RecommendedTBBVersions file."""
    tbb_versions_url = "https://www.torproject.org/projects/torbrowser/RecommendedTBBVersions"  # noqa
    versions = ut.read_url(tbb_versions_url)
    for line in versions.split():
        if "Linux" in line:
            return line.split("-")[0].lstrip('"')
    raise cm.TBBGetRecommendedVersionError()
def extract_text(url):
    prefix = 'https://commoncrawl.s3.amazonaws.com/'
    with utils.opengz(utils.read_url(url)) as f:
        for i, line in enumerate(f):
            url = f'{prefix}{line.strip()}'
            print(f'Processing {url}')
            for text in read_wet(url):
                yield text
def __get_lyrics(self, artist, track):
    while (time.time() - self.last_accessed) < 20:
        time.sleep(1)
    self.last_accessed = time.time()
    searchUrl = "http://api.chartlyrics.com/apiv1.asmx/SearchLyricDirect?artist={}&song={}"
    searchUrl = searchUrl.format(artist, track)
    xml = minidom.parseString(read_url(searchUrl))
    res = xml.getElementsByTagName("GetLyricResult")[0]
    lyrics = res.getElementsByTagName("Lyric")[0].childNodes[0].data
    return str(lyrics)
def read_wet(url, skip=2):
    text = []
    record = False
    with utils.opengz(utils.read_url(url)) as f:
        for line in f:
            if line.startswith('Content-Length:'):
                # start collecting lines once the Content-Length header is seen
                record = True
                continue
            elif line.startswith('WARC/1.0'):
                # a new WARC record header: emit the collected text (after skipping the first records)
                record = False
                if skip <= 0:
                    yield ''.join(text).strip()
                skip -= 1
                text = []
            if record:
                text.append(line)
def scrape_and_collect(url):
    logger.info("started scraping: {}".format(url))
    # read url and get response body
    html = read_url(url=url)
    logger.debug("page content:\n{}".format(html))
    # check if response is not blank; if blank then there is no need to process further
    if html == "":
        logger.error("empty response from URL, please provide a valid URL")
        return dict()
    # initialize HtmlParser object
    parser = CustomHtmlParser()
    # feed html and collect elements
    parser.feed_and_collect(html=html)
    # set result as dictionary
    result = dict()
    result["total_elements"] = parser.total_elements
    result["top_elements"] = parser.top_elements(number=5)
    return result
def get_echonest_track(artist, track):
    urlToGet = echonestURL.format(artist, track)
    jsonString = read_url(urlToGet)
    trackJson = json.loads(jsonString)
    try:
        trackSummary = trackJson['response']['songs'][0]['audio_summary']
    except (KeyError, IndexError):
        # no matching song in the response
        return None
    lyric_analyser = LyricAnalyser()
    enTrack = Track(artist, track,
                    trackSummary['energy'],
                    trackSummary['key'],
                    trackSummary['loudness'],
                    trackSummary['mode'],
                    trackSummary['tempo'],
                    trackSummary['danceability'],
                    lyric_analyser)
    return enTrack
def read_page(self):
    self.htmlText = read_url(self.url)
    # print "Got html: %s" % (self.htmlText)
    is_downloaded = True
    (callSign, self.path) = parse_folder_url(self.url)
    self.folder_title = self.path[-1:]
def test_read_url_none(self):
    response = read_url(None)
    self.assertEqual("", response)
def test_read_url(self):
    response = read_url("http://ordergroove.com/company")
    self.assertNotEqual("", response)
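The two tests above only pin down read_url's contract: an empty string for a missing or unreachable URL, a non-empty body otherwise. A minimal stdlib-only sketch that satisfies that contract (a hypothetical illustration; each project above ships its own read_url helper, which may behave differently):

import urllib.request


def read_url(url, timeout=10):
    """Fetch a URL and return the decoded response body, or "" on any failure."""
    if not url:
        return ""
    try:
        with urllib.request.urlopen(url, timeout=timeout) as response:
            return response.read().decode("utf-8", errors="replace")
    except Exception:
        return ""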
def proxy_test(url_loc):
    proxies = utils.get_random_ip(proxies_pool)
    print("Use proxy:", proxies)
    while True:
        try:
            status_code = requests.get(url_loc, headers=utils.headers,
                                       proxies=proxies, verify=False).status_code
            if status_code != 200:
                print(status_code)
                proxies = utils.get_random_ip(proxies_pool)
                continue
            break
        except Exception:
            print("cannot connect to proxy... keep trying...")
            proxies = utils.get_random_ip(proxies_pool)
    return proxies


if __name__ == '__main__':
    urls = utils.read_url("input.csv")
    for url in urls:
        print("Processing", url)
        # proxies is expected to be initialized (e.g. via proxy_test) before this loop
        results, proxies = process_url(url, proxies)
        try:
            utils.write_url(sys.argv[1], results, fieldnames)
        except IOError:
            print("Failed to open file...")