Example #1
def main():
    # Load the list of URLs to process from the input CSV.
    urls = utils.read_url("input.csv")
    for url in urls:
        print("Processing", url)
        # Process each URL and write the result row to the output CSV.
        utils.write_url("output.csv", process_url(url), fieldnames)
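main() above leans on project-specific helpers (utils.read_url, utils.write_url, process_url and the fieldnames list) that are not shown. A minimal sketch of what the two CSV helpers might look like, assuming read_url returns the URLs from the first column of the input file and write_url appends one result dict per call; the names and behaviour here are assumptions, not the project's actual code:

import csv
import os

def read_url(path):
    # Assumed behaviour: return the URL found in the first column of each row.
    with open(path, newline="") as f:
        return [row[0] for row in csv.reader(f) if row]

def write_url(path, row, fieldnames):
    # Assumed behaviour: append one result dict as a CSV row, writing a header for new files.
    new_file = not os.path.exists(path)
    with open(path, "a", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        if new_file:
            writer.writeheader()
        writer.writerow(row)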
Example #2
 def _gather_http(self):
     """Get internal node information via HTTP or HTTPS; drop the first line."""
     url = utils.apply_template(self.provider, self.values)
     result = utils.read_url(url)
     if result:
         del result[0]
     result = ''.join(result)
     return self._return(yaml.safe_load(result))
Example #4
def get_recommended_tbb_version():
    """Get the recommended TBB version from RecommendedTBBVersions file."""
    tbb_versions_url = "https://www.torproject.org/projects/torbrowser/RecommendedTBBVersions"  # noqa
    versions = ut.read_url(tbb_versions_url)
    for line in versions.split():
        if "Linux" in line:
            return line.split("-")[0].lstrip('"')
    raise TBBGetRecommendedVersionError()
Example #5
def get_recommended_tbb_version():
    """Get the recommended TBB version from RecommendedTBBVersions file."""
    tbb_versions_url = "https://www.torproject.org/projects/torbrowser/RecommendedTBBVersions"  # noqa
    versions = ut.read_url(tbb_versions_url)
    for line in versions.split():
        if "Linux" in line:
            return line.split("-")[0].lstrip('"')
    raise cm.TBBGetRecommendedVersionError()
Example #6
def extract_text(url):
    prefix = 'https://commoncrawl.s3.amazonaws.com/'

    # The listing file names one WET archive path per line.
    with utils.opengz(utils.read_url(url)) as f:
        for line in f:
            wet_url = f'{prefix}{line.strip()}'
            print(f'Processing {wet_url}')
            for text in read_wet(wet_url):
                yield text
Example #7
    def __get_lyrics(self, artist, track):
        # Rate limit: wait until at least 20 seconds have passed since the last request.
        while (time.time() - self.last_accessed) < 20:
            time.sleep(1)
        self.last_accessed = time.time()

        # Note: artist and track should ideally be URL-encoded before interpolation.
        searchUrl = "http://api.chartlyrics.com/apiv1.asmx/SearchLyricDirect?artist={}&song={}"
        searchUrl = searchUrl.format(artist, track)

        # Parse the XML response and extract the lyric text from the first result.
        xml = minidom.parseString(read_url(searchUrl))
        res = xml.getElementsByTagName("GetLyricResult")[0]
        lyrics = res.getElementsByTagName("Lyric")[0].childNodes[0].data

        return str(lyrics)
Example #8
def read_wet(url, skip=2):
    """Yield the text of each record in a WET archive, skipping the first `skip` records."""
    text = []
    record = False

    with utils.opengz(utils.read_url(url)) as f:
        for line in f:
            if line.startswith('Content-Length:'):
                # The record body follows the Content-Length header.
                record = True
                continue
            elif line.startswith('WARC/1.0'):
                # A new WARC header ends the previous record.
                record = False
                if skip <= 0:
                    yield ''.join(text).strip()
                skip -= 1
                text = []

            if record:
                text.append(line)
Example #9
def scrape_and_collect(url):
    logger.info("started scrapping : {}".format(url))

    # read url and get response body
    html = read_url(url=url)
    logger.debug("page content : \n {}".format(html))

    # check if response is not blank, if blank then no need to process further
    if html == "":
        logger.error("empty response from URL, please provide valid URL")
        return dict()

    # initialize HtmlParser object
    parser = CustomHtmlParser()
    # feed html and collect elements
    parser.feed_and_collect(html=html)

    # set result as dictionary
    result = dict()
    result["total_elements"] = parser.total_elements
    result["top_elements"] = parser.top_elements(number=5)

    return result
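CustomHtmlParser is not included in the snippet. A rough sketch of how such a parser could be built on the standard library's html.parser, assuming total_elements counts every start tag and top_elements returns the most frequent tag names; this is an illustration, not the original class:

from collections import Counter
from html.parser import HTMLParser

class CustomHtmlParser(HTMLParser):
    # Hypothetical stand-in: counts start tags and reports the most common ones.

    def __init__(self):
        super().__init__()
        self.tag_counts = Counter()
        self.total_elements = 0

    def handle_starttag(self, tag, attrs):
        self.tag_counts[tag] += 1
        self.total_elements += 1

    def feed_and_collect(self, html):
        self.feed(html)

    def top_elements(self, number=5):
        return dict(self.tag_counts.most_common(number))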
Example #10
def get_echonest_track(artist, track):
    urlToGet = echonestURL.format(artist, track)
    jsonString = read_url(urlToGet)
    trackJson = json.loads(jsonString)

    try:
        trackSummary = trackJson['response']['songs'][0]['audio_summary']
    except (KeyError, IndexError):
        # No matching song (or missing audio summary) in the response.
        return None

    lyric_analyser = LyricAnalyser()

    enTrack = Track(
        artist,
        track,
        trackSummary['energy'],
        trackSummary['key'],
        trackSummary['loudness'],
        trackSummary['mode'],
        trackSummary['tempo'],
        trackSummary['danceability'],
        lyric_analyser)

    return enTrack
Example #11
 def read_page(self):
     self.htmlText = read_url(self.url)
     # print("Got html: %s" % self.htmlText)
     self.is_downloaded = True
     (callSign, self.path) = parse_folder_url(self.url)
     self.folder_title = self.path[-1:]
Example #12
 def test_read_url_none(self):
     response = read_url(None)
     self.assertEqual("", response)
Example #13
 def test_read_url(self):
     response = read_url("http://ordergroove.com/company")
     self.assertNotEqual("", response)
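The two tests above capture the contract most of these examples rely on: read_url returns an empty string when the URL is missing or the request fails, and the response body otherwise. A minimal standard-library sketch of such a helper follows; the real implementations in these projects differ (some, for instance, return a list of lines rather than a single string):

from urllib.request import urlopen
from urllib.error import URLError

def read_url(url):
    # Return the response body as text, or "" if the URL is missing or unreachable.
    if not url:
        return ""
    try:
        with urlopen(url) as response:
            return response.read().decode("utf-8", errors="replace")
    except (URLError, ValueError):
        return ""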
Example #14
def proxy_test(url_loc):
    """Keep trying random proxies until one fetches url_loc with HTTP 200."""
    proxies = utils.get_random_ip(proxies_pool)
    print("Using proxy:", proxies)
    while True:
        try:
            status_code = requests.get(url_loc,
                                       headers=utils.headers,
                                       proxies=proxies,
                                       verify=False).status_code
            if status_code != 200:
                print(status_code)
                proxies = utils.get_random_ip(proxies_pool)
                continue
            break
        except Exception:
            print("cannot connect proxy... keep trying...")
            proxies = utils.get_random_ip(proxies_pool)

    return proxies


if __name__ == '__main__':
    urls = utils.read_url("input.csv")
    # process_url needs an initial proxy; it returns an updated one on each call.
    proxies = utils.get_random_ip(proxies_pool)
    for url in urls:
        print("Processing", url)
        results, proxies = process_url(url, proxies)
        try:
            utils.write_url(sys.argv[1], results, fieldnames)
        except IOError:
            print("Failed to open file...")