async def fetchWithRetry(row, session):
    """
    Hits row["Url"] with retries.
        * on status 200: scrapes the page and fills row["Content"] & row["WeightedContent"]
        * if the scraped content comes back empty: falls back to the Title
        * if all retries fail: bumps SKIPPED_ASYNC and returns the row unchanged
    """
    status = 400
    retry_cnt = 3
    sleep_time = 10
    TIMEOUT = 60

    while retry_cnt > 0 and status != 200:
        # NOTE: Purpose.CLIENT_AUTH builds a context without server-cert/hostname
        # verification, so bad certificates won't abort the scrape.
        async with session.get(row["Url"],
                               ssl=ssl.create_default_context(purpose=ssl.Purpose.CLIENT_AUTH),
                               timeout=TIMEOUT) as response:
            res = await response.text()
            status = response.status
            if status == 200 and len(res) != 0:
                pc.printSucc("\t\t <ID = {}><src= {} > ============== Scraping Done....... \t NOW: {}"
                             .format(row["ID"], row["SourceSite"],
                                     time.strftime("%H:%M:%S", time.localtime())))
                urlstrings = text_actions.getUrlString(row["Content"])
                row["WeightedContent"] = text_actions.clean_text(
                    text_actions.weightedcontentfromhtml(res) + row["Title"] + urlstrings)
                row["Content"] = text_actions.clean_text(
                    text_actions.contentfromhtml(res) + urlstrings)
                if len(row["Content"]) == 0:
                    # Nothing useful scraped: fall back to the title.
                    row["WeightedContent"] = text_actions.clean_text(row["Title"])
                    row["Content"] = text_actions.clean_text(row["Title"])
                return row
            else:
                retry_cnt -= 1
                pc.printWarn("\t x---------------- Unable to hit URL(ERR_CODE={}): {} Sleeping for {} Retries remaining = {} -------------x"
                             .format(status, row["Url"], sleep_time, retry_cnt))
                await asyncio.sleep(sleep_time)

    pc.printErr("\t\txxxxx SKIPPING... for <ID = {}><src= {} > Unable to hit url: {}"
                .format(row["ID"], row["SourceSite"], row["Url"]))
    global SKIPPED_ASYNC
    SKIPPED_ASYNC += 1
    return row
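# ---------------------------------------------------------------------------
# The functions in this module lean on a set of imports, project helpers
# (pc, text_actions, csv_functions, web_requests, date_conversion, gw, vault)
# and module-level counters defined elsewhere in the repo. A rough sketch of
# what that preamble presumably looks like (names inferred from the call sites;
# the exact module names are an assumption):
#
#   import asyncio, csv, json, logging, os, sqlite3, ssl, time, traceback
#   from datetime import datetime, timedelta
#   from aiohttp import ClientSession, TCPConnector
#   import praw
#   from prettytable import PrettyTable
#
#   import printColors as pc            # hypothetical module name
#   import text_actions, csv_functions, web_requests, date_conversion
#   import global_vars as gw            # hypothetical module name
#   import vault                        # reddit credentials
#
#   SKIPPED_ASYNC = 0
#   ENTRIES_TO_BE_WRITTEN = 0
#   WRITTEN_ENTRIES_ASYNC_DIRECT = 0
#   WRITTEN_ENTRIES_ASYNC_SCRAPED = 0
# ---------------------------------------------------------------------------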
async def asyncFetchAll(csv_in, csv_out):
    """
    INPUT: csv_src_file & csv_dest_file (to be written)
    NOTE:
        * Semaphore limit is: 1000
        * Rows that already have Content are cleaned & written to csv_dest_file directly;
          the rest are scraped concurrently and written as the responses come back
    """
    tasks = []
    sem = asyncio.Semaphore(1000)
    global ENTRIES_TO_BE_WRITTEN, WRITTEN_ENTRIES_ASYNC_DIRECT, WRITTEN_ENTRIES_ASYNC_SCRAPED

    """ Initialize the output file """
    headers = ['ID', 'SourceSite', 'ProcessingDate', 'ProcessingEpoch', 'CreationDate',
               'Title', 'Url', 'SourceTags', 'ModelTags', 'NumUpvotes', 'NumComments',
               'PopI', 'WeightedContent', 'Content']
    csv_functions.creteCsvFile(csv_out, headers)

    connector = TCPConnector(limit=0)
    async with ClientSession(headers={'Connection': 'keep-alive'}, connector=connector) as session:
        with open(csv_in, mode='r') as csvfile:
            csv_reader = csv.DictReader(csvfile)
            for row in csv_reader:
                ENTRIES_TO_BE_WRITTEN += 1
                if len(row["Content"]) != 0:
                    # Content already present: clean it and write it out directly, no scraping.
                    pc.printWarn("\t <ID = {}><src= {} > [NO SCRAPING] Content already exists............... NOW: {}"
                                 .format(row["ID"], row["SourceSite"],
                                         time.strftime("%H:%M:%S", time.localtime())))
                    row["WeightedContent"] = text_actions.clean_text(
                        row["Title"] + row["WeightedContent"]) + text_actions.getUrlString(row["Content"])
                    row["Content"] = text_actions.clean_text(
                        row["Content"]) + text_actions.getUrlString(row["Content"])
                    # Keep the column order in sync with `headers`: WeightedContent before Content.
                    entry = [row["ID"], row["SourceSite"], row["ProcessingDate"], row["ProcessingEpoch"],
                             row["CreationDate"], row["Title"], row["Url"], row["SourceTags"],
                             row["ModelTags"], row["NumUpvotes"], row["NumComments"], row["PopI"],
                             row["WeightedContent"], row["Content"]]
                    csv_functions.putToCsv(csv_out, entry)
                    WRITTEN_ENTRIES_ASYNC_DIRECT += 1
                    pc.printMsg(" \t\t ============== Done Writing into csv for <ID = {}><src= {} > =============== "
                                .format(row["ID"], row["SourceSite"]))
                elif row["Url"] and row["Title"]:
                    # No content yet, but we have a url & title: queue it for scraping.
                    task = asyncio.ensure_future(semaphoreSafeFetch(sem, row, session))
                    tasks.append(task)

            responses = await asyncio.gather(*tasks)
            pc.printMsg("\n@@@@@@@@@@@@@@@@@@@@ len(responses):: to be scraped = {} @@@@@@@@@@@@@@@@@@@@\n"
                        .format(len(responses)))

            for row in responses:
                if row["Content"] or row["Title"]:
                    if len(row["Content"]) == 0:
                        # Scraping produced nothing: fall back to the title as content.
                        row["Content"] = row["Title"]
                    entry = [row["ID"], row["SourceSite"], row["ProcessingDate"], row["ProcessingEpoch"],
                             row["CreationDate"], row["Title"], row["Url"], row["SourceTags"],
                             row["ModelTags"], row["NumUpvotes"], row["NumComments"], row["PopI"],
                             row["WeightedContent"], row["Content"]]
                    await write_result(csv_out, entry)
                    WRITTEN_ENTRIES_ASYNC_SCRAPED += 1
                    pc.printMsg(" \t\t ============== Done Writing into csv for <ID = {}><src= {} > =============== "
                                .format(row["ID"], row["SourceSite"]))
                else:
                    pc.printErr("\t\t xxxxxxxxxxxxxxxxxxx Skipping for <ID = {}><src= {} > As No Content & Title xxxxxxxxxxxxxxxxxxxxxxxx\n"
                                .format(row["ID"], row["SourceSite"]))
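# `semaphoreSafeFetch` and `write_result` are called above but defined elsewhere in
# the repo; this is only a minimal sketch of what they presumably do (behaviour
# inferred from the call sites, not the actual implementation).
async def semaphoreSafeFetch(sem, row, session):
    # Gate fetchWithRetry behind the shared semaphore so only a bounded number
    # of requests are in flight at once.
    async with sem:
        return await fetchWithRetry(row, session)


async def write_result(csv_out, entry):
    # Append one finished entry to the output csv. (A real implementation might
    # buffer rows or hand the write off to a thread to avoid blocking the loop.)
    with open(csv_out, 'a', newline='') as f:
        csv.writer(f).writerow(entry)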
def RunSync(ts):
    """
    Picks wc-db's table mapped to `ts` and scrapes (useful) "clean" Content & WeightedContent from each Url.
        * NOTE:
            * If Content is already present in the table, "clean" it too & append the url-words to it.
            * FIRST RUN: time = 17 hours, data = 12 MB, #entries = 6.5k
    Input: ts (format: 1598692058.887741)
    """
    pc.printMsg('@[{}] >>>>>> Started Content-scraper(SYNC) ................... => FILENAME: {}\n'
                .format(datetime.fromtimestamp(ts), 'dbs/wc-db/wc_table_' + str(int(ts)) + '_wc.csv'))

    csv_src_file = '/Users/aayush.chaturvedi/Sandbox/cynicalReader/dbs/wc-db/wc_table_' + str(int(ts)) + '.csv'
    csv_dest_file = '/Users/aayush.chaturvedi/Sandbox/cynicalReader/dbs/wc-db/wc_table_' + str(int(ts)) + '_wc_sync.csv'

    headers = ['ID', 'SourceSite', 'ProcessingDate', 'ProcessingEpoch', 'CreationDate',
               'Title', 'Url', 'SourceTags', 'ModelTags', 'NumUpvotes', 'NumComments',
               'PopI', 'WeightedContent', 'Content']
    csv_functions.creteCsvFile(csv_dest_file, headers)   # flushes the old file and writes the header row

    global WRITTEN_ENTRIES_SYNC, SKIPPED_SYNC, FAILED_SYNC

    with open(csv_src_file, mode='r') as csvfile, open(csv_dest_file, 'a', newline='') as destfile:
        csv_reader = csv.DictReader(csvfile)
        writer = csv.writer(destfile)
        line_count = 0
        for row in csv_reader:
            if line_count == 0:
                print(f'Headers are {", ".join(row)}')
                line_count += 1

            # CHECK1 (pre scraping): if Content is already non-empty => no scraping, clean & write as-is
            if len(row["Content"]) != 0:
                pc.printWarn("\t <ID = {} > [NO SCRAPING] Content already exists....putting as it is............. NOW: {}"
                             .format(row["ID"], time.strftime("%H:%M:%S", time.localtime())))
                entry = [row["ID"], row["SourceSite"], row["ProcessingDate"], row["ProcessingEpoch"],
                         row["CreationDate"], row["Title"], row["Url"], row["SourceTags"],
                         row["ModelTags"], row["NumUpvotes"], row["NumComments"], row["PopI"],
                         text_actions.clean_text(row["Title"] + row["WeightedContent"])
                         + text_actions.getUrlString(row["Content"]),      # add the url-words too
                         text_actions.clean_text(row["Content"])
                         + text_actions.getUrlString(row["Content"])]
                WRITTEN_ENTRIES_SYNC += 1
                writer.writerow(entry)

            # CHECK2 (pre scraping): if Url is empty => discard
            # CHECK3 (pre scraping): if Title is empty => discard
            elif len(row["Url"]) != 0 and len(row["Title"]) != 0:
                pc.printWarn("\t <ID = {} > [SCRAPING BEGIN] sleeping for 0.0001 second ZZZZZZzzzzzzzzzzzz................. NOW: {}"
                             .format(row["ID"], time.strftime("%H:%M:%S", time.localtime())))
                time.sleep(0.0001)
                try:
                    response = web_requests.hitGetWithRetry(row["Url"], '', False, 2, 0.5, 60)
                    if response != -1:
                        content = text_actions.contentfromhtml(response.text)
                        urlstrings = text_actions.getUrlString(content)
                        content += urlstrings                                       # add the url-words too
                        weightedcontent = (text_actions.weightedcontentfromhtml(response.text)
                                           + row["Title"] + urlstrings)             # add the url-words too
                        line_count += 1

                        # CHECK1 (post scraping): if content is still empty, fall back to the Title.
                        # (The earlier version only wrote the row when content was non-empty,
                        # which silently dropped these fallback rows.)
                        if len(content) == 0:
                            content = row["Title"]
                            weightedcontent = row["Title"]

                        entry = [row["ID"], row["SourceSite"], row["ProcessingDate"], row["ProcessingEpoch"],
                                 row["CreationDate"], row["Title"], row["Url"], row["SourceTags"],
                                 row["ModelTags"], row["NumUpvotes"], row["NumComments"], row["PopI"],
                                 text_actions.clean_text(weightedcontent),
                                 text_actions.clean_text(content)]
                        writer.writerow(entry)
                        pc.printMsg("\t\t <ID = {} > ============== Scraping Done....... \t NOW: {}"
                                    .format(row["ID"], time.strftime("%H:%M:%S", time.localtime())))
                    else:
                        SKIPPED_SYNC += 1
                        pc.printErr("\t\txxxxx SKIPPING... for ID: {} Unable to hit url: {}"
                                    .format(row["ID"], row["Url"]))
                except Exception as e:
                    FAILED_SYNC += 1
                    pc.printErr("\t======= XXXXXXXX ERROR XXXXXX ======>> ID= {} NOW = {} Skipping...Failed due to: \n \t\t ERROR {}"
                                .format(row["ID"], time.strftime("%H:%M:%S", time.localtime()), e))

    pc.printMsg("\n****************** Content Scraping is Complete , FILENAME: {} ********************\n"
                .format('dbs/wc-db/wc_table_' + str(int(ts)) + '_wc.csv'))
    pc.printMsg("\n----------------------------------------------------------------------------------\n")
    pc.printMsg("|\tWRITTEN_ENTRIES_SYNC \t | \t {} \t|".format(WRITTEN_ENTRIES_SYNC))
    pc.printMsg("|\tSKIPPED_SYNC \t | \t {} \t|".format(SKIPPED_SYNC))
    pc.printMsg("|\tFAILED_SYNC \t | \t {} \t|".format(FAILED_SYNC))
    pc.printMsg("\n----------------------------------------------------------------------------------\n")
def cleanNcheckAsyncOutput(csv_in, csv_out):
    """
    Analyses the input & the freshly created output files.
    Also cleans Content & WeightedContent -> puts them in a new file (the old one can then be deleted).
    Variables:
        * NO_LINES_IN_INPUT_CSV
        * NO_LINES_IN_OUTPUT_CSV
        * NO_LINES_IN_OUTPUT_WITHOUT_TITLE
        * NO_LINES_IN_OUTPUT_WITHOUT_URL
        * NO_LINES_IN_OUTPUT_WITHOUT_CONTENT
    """
    with open(csv_in, "r") as f:
        reader = csv.reader(f)
        NO_LINES_IN_INPUT_CSV = len(list(reader))

    """ Now check and create the new "cleaned" file """
    headers = ['ID', 'SourceSite', 'ProcessingDate', 'ProcessingEpoch', 'CreationDate',
               'Title', 'Url', 'SourceTags', 'ModelTags', 'NumUpvotes', 'NumComments',
               'PopI', 'WeightedContent', 'Content']
    csv_final_out = os.path.join("F", csv_out)
    csv_functions.creteCsvFile(csv_final_out, headers)
    pc.prCyan(" ========================== NOW CREATING FINAL OUTPUT FILE: {} ==========================="
              .format(csv_final_out))

    NO_LINES_IN_OUTPUT_CSV = 0
    NO_LINES_IN_OUTPUT_WITHOUT_TITLE = 0
    NO_LINES_IN_OUTPUT_WITHOUT_URL = 0
    NO_LINES_IN_OUTPUT_WITHOUT_CONTENT = 0

    with open(csv_out, mode='r') as r, open(csv_final_out, 'a+', newline='') as f:
        reader = csv.DictReader(r)   # DictReader already consumes the header row
        writer = csv.writer(f)
        for row in reader:
            url_string_content = text_actions.getUrlString(row["Content"])
            content = text_actions.clean_text(row["Content"])
            weighted_content = text_actions.clean_text(row["WeightedContent"])
            entry = [row["ID"], row["SourceSite"], row["ProcessingDate"], row["ProcessingEpoch"],
                     row["CreationDate"], row["Title"], row["Url"], row["SourceTags"],
                     row["ModelTags"], row["NumUpvotes"], row["NumComments"], row["PopI"],
                     weighted_content + url_string_content,
                     content]
            writer.writerow(entry)
            NO_LINES_IN_OUTPUT_CSV += 1
            if len(row["Title"]) == 0:
                NO_LINES_IN_OUTPUT_WITHOUT_TITLE += 1
            if len(row["Url"]) == 0:
                NO_LINES_IN_OUTPUT_WITHOUT_URL += 1
            if len(row["Content"]) == 0:
                NO_LINES_IN_OUTPUT_WITHOUT_CONTENT += 1

    # TODO: os.remove(csv_in) %% rename

    pc.printWarn("\n\n~~~~~~~~~~~~~~~~~~~~~~~~~~~ Analysis ~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")
    pc.printWarn("|\t NO_LINES_IN_INPUT_CSV \t | \t {} \t|".format(NO_LINES_IN_INPUT_CSV))
    pc.printWarn("|\t NO_LINES_IN_OUTPUT_CSV \t | \t {} \t|".format(NO_LINES_IN_OUTPUT_CSV))
    pc.printWarn("|\t NO_LINES_IN_OUTPUT_WITHOUT_TITLE \t | \t {} \t|".format(NO_LINES_IN_OUTPUT_WITHOUT_TITLE))
    pc.printWarn("|\t NO_LINES_IN_OUTPUT_WITHOUT_URL \t | \t {} \t|".format(NO_LINES_IN_OUTPUT_WITHOUT_URL))
    pc.printWarn("|\t NO_LINES_IN_OUTPUT_WITHOUT_CONTENT \t | \t {} \t|".format(NO_LINES_IN_OUTPUT_WITHOUT_CONTENT))
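# A minimal end-to-end driver for the async path, assuming file names shaped like the
# ones RunSync() uses (the exact paths here are only illustrative):
if __name__ == "__main__":
    ts = time.time()
    csv_in = 'dbs/wc-db/wc_table_' + str(int(ts)) + '.csv'
    csv_out = 'dbs/wc-db/wc_table_' + str(int(ts)) + '_wc.csv'
    asyncio.get_event_loop().run_until_complete(asyncFetchAll(csv_in, csv_out))
    cleanNcheckAsyncOutput(csv_in, csv_out)   # writes the final file at os.path.join("F", csv_out)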
def run(ts):
    """
    Gets top submissions of the listed subreddits for the past week
    (limit = gw.R_ITEM_LIMIT_PER_SUBREDDIT; the API max of 1000 should be enough).
    `ts` is only used for naming the table & stamping the processing date.
    """
    startTime = time.time()
    wc_db = 'dbs/wc.db'
    wc_table = 'wc_' + str(int(ts))
    pc.printSucc('@[{}] >>>>>> Started r-scraper ................... => TABLE: {}\n'
                 .format(datetime.fromtimestamp(ts), wc_table))
    pc.printMsg("\t -------------------------------------- < r_SCRAPER: DB/wc Connection Opened > ---------------------------------------------\n")

    conn = sqlite3.connect(wc_db, timeout=10)
    c = conn.cursor()

    blob_pages = ['.jpg', '.png', '.gif', '.mp3', '.mp4']   # these give blob data; no point in scraping them
    index = gw.WC_TOTAL_URL_ENTRIES + 1

    # Setup client
    reddit = praw.Reddit(
        client_id=vault.R_CLIENT_ID,          # PERSONAL_USE_SCRIPT_14_CHARS
        client_secret=vault.R_CLIENT_SECRET,  # SECRET_KEY_27_CHARS
        user_agent=vault.R_USER_AGENT,        # YOUR_APP_NAME
        username=vault.R_USERNAME,            # YOUR_REDDIT_USER_NAME
        password=vault.R_PASSWORD)            # YOUR_REDDIT_LOGIN_PASSWORD

    for subreddit, tag_arr in LIST.items():
        try:
            pc.printWarn("\t ............ Subreddit@R_UrlScraping : {} .............".format(subreddit))
            sr = reddit.subreddit(subreddit)
            # for submission in sr.top('day', limit=10):   # for testing
            ENTRIES_IN_THIS_SUBREDDIT = 0
            for submission in sr.top('week', limit=gw.R_ITEM_LIMIT_PER_SUBREDDIT):   # NOTE: max limit is 1000
                content = ''

                """ Fixing permalink-type urls """
                url = submission.url
                if url[:2] == '/r':
                    url = "https://www.reddit.com" + url

                # Check1: skip posts locked by the mods
                if submission.locked == False:
                    # Check2: if the post is just an image/video, discard it
                    if submission.url[-4:] not in blob_pages:   # reddit currently hosts .png & .gif only
                        # If permalink is a substring of url OR the submission is a selfpost (text-only)
                        # => nothing external to scrape, take the selftext.
                        # NOTE: there might be links in a post with some description + a link to another
                        # article the author is referring to, but it's not worth the processing time.
                        if (submission.permalink in submission.url) or (submission.is_self == True):
                            content = submission.selftext
                        entry = [index, "r/" + subreddit, datetime.fromtimestamp(ts).date(), int(ts),
                                 date_conversion.RedditDate(str(datetime.fromtimestamp(submission.created))),
                                 submission.title, url, json.dumps(tag_arr), '',
                                 submission.score, submission.num_comments, '', '',
                                 text_actions.clean_text(content)]
                        c.execute('INSERT INTO ' + wc_table + ' VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)', entry)
                        index += 1
                        ENTRIES_IN_THIS_SUBREDDIT += 1
            gw.R_TOTAL_ITEMS_GOT_YET += ENTRIES_IN_THIS_SUBREDDIT
            pc.printMsg("\t\t\t\t\t ====> ENTRIES_IN_THIS_SUBREDDIT = {} \t\t | \t gw.R_TOTAL_ITEMS_GOT_YET = {}"
                        .format(ENTRIES_IN_THIS_SUBREDDIT, gw.R_TOTAL_ITEMS_GOT_YET))
        except Exception as e:
            pc.printErr(" \t xxxxxxxxxxxxx ERROR@r_UrlScraping xxxxxxxxxxxxxxxxxxxx >> [ID]= {} Skipping...Failed due to: {} \n"
                        .format(index, e))
            logging.error(traceback.format_exc())

    endTime = time.time()
    gw.WC_TOTAL_URL_ENTRIES += gw.R_TOTAL_ITEMS_GOT_YET
    conn.commit()
    conn.close()
    pc.printMsg("\t -------------------------------------- < r_SCRAPER: DB/wc Connection Closed > ---------------------------------------------\n")
    pc.printSucc("\n\n***************************** Reddit Url Scraping is Complete. TABLE: {} ******************"
                 .format(wc_table))
    print("\n\n")

    table = PrettyTable(['Entity (Post r URL Scraping)', 'Value'])
    table.add_row(['TOTAL URLS FETCHED by r', gw.R_TOTAL_ITEMS_GOT_YET])
    table.add_row(['TOTAL ITEMS IN WC TABLE YET', gw.WC_TOTAL_URL_ENTRIES])
    table.add_row(['TIME TAKEN FOR URL SCRAPING-r (min) ', round((endTime - startTime) / 60, 2)])
    pc.printSucc(table)
    print("\n\n")
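# `LIST` (iterated above) maps each subreddit name to its tag array. Its real
# contents live elsewhere in the repo; the entries below are purely illustrative.
LIST = {
    "programming": ["prog_news"],
    "MachineLearning": ["ml"],
    "startups": ["startup"],
}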
def run(ts):
    """
    Scrapes Algolia's HN api for the last 7 days & puts the data in WC-DB.
        * max number of entries in a single algolia api call = 1000, so scrape one day at a time
        * Link to documentation: https://hn.algolia.com/api
    Note:
        1. For AskHN entries put `prog_query` tag & a separate threshold
        2. For ShowHN entries put `sideproj` tag & a separate threshold
        3. For Jobs@HN entries put `` tag => later, as these entries don't have upvotes/comments
    Input: ts (format: 1598692058.887741)
    """
    wc_db = 'dbs/wc.db'
    wc_table = 'wc_' + str(int(ts))
    pc.printSucc('@[{}] >>>>>> Started HN-scraper ................... => TABLE: {}\n'
                 .format(datetime.fromtimestamp(ts), wc_table))
    conn = sqlite3.connect(wc_db, timeout=10)
    c = conn.cursor()
    pc.printMsg("\t -------------------------------------- < HN_SCRAPER: DB/wc Connection Opened > ---------------------------------------------\n")
    startTime = time.time()

    """
    Here is how you add a day to `ts`:
        from datetime import datetime, timedelta
        newts = datetime.fromtimestamp(ts) + timedelta(days=1)   # 2020-08-30 16:02:34.352094
        newts.timestamp()                                        # 1598783633.284871
        datetime.fromtimestamp(ts)                               # 2020-08-29 17:15:32
    """

    """
    ts_arr holds the last 7 days' (including today's) integer timestamp strings.
    TIP: use `datetime.fromtimestamp(int(t))` to convert to human readable format
    """
    ts_arr = [str(int(ts))]
    for i in range(6):
        new_ts = datetime.fromtimestamp(int(ts_arr[-1])) + timedelta(days=-1)
        new_ts = new_ts.timestamp()
        ts_arr.append(str(int(new_ts)))

    index = gw.WC_TOTAL_URL_ENTRIES + 1

    for i in range(len(ts_arr) - 1):
        startepoch = ts_arr[i]
        endepoch = ts_arr[i + 1]
        pc.printMsg(" ................. scraping for interval: start= {} -> end = {} .................\n"
                    .format(startepoch, endepoch))

        """
        Getting stories (articles) with upvote count > upvote threshold.
        Also includes:
            1. TellHN   (<tech_discuss>)
            2. LaunchHN (<startup>)
        """
        pc.printWarn(" \t............. scraping stories .............")
        try:
            url_story = ('http://hn.algolia.com/api/v1/search_by_date?tags=story&hitsPerPage=9999'
                         '&numericFilters=created_at_i>' + str(endepoch)
                         + ',created_at_i<' + str(startepoch)
                         + ',points>' + str(gw.HN_STORY_UPVOTE_TH))
            data = web_requests.hitGetWithRetry(url_story)
            res_size = json.loads(data.content)["nbHits"]
            pc.printMsg("\t\t\t\t====> Item count: {}".format(res_size))
            gw.HN_TOTAL_ITEMS_GOT_YET += res_size
            items_arr = json.loads(data.content)["hits"]
            for item in items_arr:
                url = 'https://news.ycombinator.com/item?id=' + str(item["objectID"])
                sourceTag = ''
                content = ''
                sourceSite = 'HN'
                if item["url"] is None:   # not every story has an outbound url
                    if item["story_text"] is not None:
                        content = text_actions.getTextFromHtml(item["story_text"])
                    if "Launch HN:" in item["title"]:   # 1. LaunchHN
                        sourceTag = 'startup'
                        sourceSite += '/launch'
                    if "Tell HN:" in item["title"]:     # 2. TellHN
                        sourceTag = 'tech_discuss'
                        sourceSite += '/tell'
                else:
                    url = item["url"]
                entry = [index, sourceSite, datetime.fromtimestamp(ts).date(), int(ts),
                         date_conversion.HNDate(str(item["created_at"])), item["title"], url,
                         sourceTag, '', item["points"], item["num_comments"], '', '',
                         text_actions.clean_text(content)]
                c.execute('INSERT INTO ' + wc_table + ' VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)', entry)
                index = index + 1
            pc.printMsg("\t\t\t ====>> gw.HN_TOTAL_ITEMS_GOT_YET = {}".format(gw.HN_TOTAL_ITEMS_GOT_YET))
        except Exception as e:
            pc.printErr(" \t xxxxxxxxxxxxx ERROR@HN_UrlScraping(stories) xxxxxxxxxxxxxxxxxxxx >> [ID]= {} Skipping...Failed due to: {} \n"
                        .format(index, e))
            logging.error(traceback.format_exc())

        """ Getting ShowHNs """
        pc.printWarn("\t............. scraping showHNs .............")
        try:
            url_show = ('http://hn.algolia.com/api/v1/search_by_date?tags=show_hn&hitsPerPage=9999'
                        '&numericFilters=created_at_i>' + str(endepoch)
                        + ',created_at_i<' + str(startepoch)
                        + ',points>' + str(gw.HN_SHOWHN_UPVOTE_TH))
            data = web_requests.hitGetWithRetry(url_show)
            res_size = json.loads(data.content)["nbHits"]
            pc.printMsg("\t\t\t\t====> Item count: {}".format(res_size))
            gw.HN_TOTAL_ITEMS_GOT_YET += res_size
            items_arr = json.loads(data.content)["hits"]
            for item in items_arr:
                content = ''
                sourceSite = 'HN/show'
                if item["url"] is None:   # not every ShowHN has an outbound url
                    url = 'https://news.ycombinator.com/item?id=' + str(item["objectID"])
                    if item["story_text"] is not None:
                        content = text_actions.getTextFromHtml(item["story_text"])
                else:
                    url = item["url"]
                entry = [index, sourceSite, datetime.fromtimestamp(ts).date(), int(ts),
                         date_conversion.HNDate(str(item["created_at"])), item["title"], url,
                         'sideproj', '', item["points"], item["num_comments"], '', '',
                         text_actions.clean_text(content)]
                c.execute('INSERT INTO ' + wc_table + ' VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)', entry)
                index = index + 1
            pc.printMsg("\t\t\t ====>> gw.HN_TOTAL_ITEMS_GOT_YET = {}".format(gw.HN_TOTAL_ITEMS_GOT_YET))
        except Exception as e:
            pc.printErr(" \t xxxxxxxxxxxxx ERROR@HN_UrlScraping(showHN) xxxxxxxxxxxxxxxxxxxx >> [ID]= {} Skipping...Failed due to: {} \n"
                        .format(index, e))
            logging.error(traceback.format_exc())

        """ Getting AskHNs """
        pc.printWarn("\t............. scraping askHNs .............")
        try:
            url_ask = ('http://hn.algolia.com/api/v1/search_by_date?tags=ask_hn&hitsPerPage=9999'
                       '&numericFilters=created_at_i>' + str(endepoch)
                       + ',created_at_i<' + str(startepoch)
                       + ',points>' + str(gw.HN_ASKHN_UPVOTE_TH))
            data = web_requests.hitGetWithRetry(url_ask)
            res_size = json.loads(data.content)["nbHits"]
            pc.printWarn("\t\t\t\t====> Item count: {}".format(res_size))
            gw.HN_TOTAL_ITEMS_GOT_YET += res_size
            items_arr = json.loads(data.content)["hits"]
            for item in items_arr:
                content = ''
                sourceSite = 'HN/ask'
                if item["url"] is None:   # AskHNs usually don't have an outbound url
                    url = 'https://news.ycombinator.com/item?id=' + str(item["objectID"])
                    if item["story_text"] is not None:
                        content = text_actions.getTextFromHtml(item["story_text"])
                else:
                    url = item["url"]
                entry = [index, sourceSite, datetime.fromtimestamp(ts).date(), int(ts),
                         date_conversion.HNDate(str(item["created_at"])), item["title"], url,
                         'prog_query', '', item["points"], item["num_comments"], '', '',
                         text_actions.clean_text(content)]
                c.execute('INSERT INTO ' + wc_table + ' VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)', entry)
                index = index + 1
            pc.printMsg("\t\t\t ====>> gw.HN_TOTAL_ITEMS_GOT_YET = {}".format(gw.HN_TOTAL_ITEMS_GOT_YET))
        except Exception as e:
            pc.printErr(" \t xxxxxxxxxxxxx ERROR@HN_UrlScraping(askHN) xxxxxxxxxxxxxxxxxxxx >> [ID]= {} Skipping...Failed due to: {} \n"
                        .format(index, e))
            logging.error(traceback.format_exc())

    endTime = time.time()
    conn.commit()
    conn.close()
    gw.WC_TOTAL_URL_ENTRIES += gw.HN_TOTAL_ITEMS_GOT_YET
    pc.printMsg("\t -------------------------------------- < HN_SCRAPER: DB/wc Connection Closed > ---------------------------------------------\n")
    pc.printSucc("\n\n***************************** HN Url Scraping is Complete. TABLE: {} ******************"
                 .format(wc_table))
    print("\n\n")

    table = PrettyTable(['Entity (Post HN URL Scraping)', 'Value'])
    table.add_row(['TOTAL URLS FETCHED by HN', gw.HN_TOTAL_ITEMS_GOT_YET])
    table.add_row(['TOTAL ITEMS IN WC TABLE YET', gw.WC_TOTAL_URL_ENTRIES])
    table.add_row(['TIME TAKEN FOR URL SCRAPING-HN (sec) ', round((endTime - startTime), 5)])
    pc.printSucc(table)
    print("\n\n")
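# Quick standalone sanity check of the kind of search_by_date query the scraper
# builds above (the points>100 threshold here is only an illustrative value, not
# the real gw.HN_STORY_UPVOTE_TH setting). Requires the `requests` package.
import requests

def peek_hn_stories():
    end = int(time.time()) - 24 * 3600      # one day ago
    start = int(time.time())                # now
    url = ('http://hn.algolia.com/api/v1/search_by_date?tags=story&hitsPerPage=1000'
           '&numericFilters=created_at_i>{},created_at_i<{},points>100'.format(end, start))
    resp = requests.get(url, timeout=30)
    body = resp.json()
    print("nbHits:", body["nbHits"])
    for item in body["hits"][:5]:
        print(item["points"], "-", item["title"])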
async def fetchWithRetry(row, session):
    """
    Hits the url (with retries):
        * if status == 200: returns the row with (raw) Content & (raw) WeightedContent filled in
        * if still unable to hit after retries: the row is returned unchanged and the caller
          writes it anyway (Content = Title, WeightedContent = Title)
    INPUT: `row` is a tuple with indices:
        ID(0), SourceSite(1), ProcessingDate(2), ProcessingEpoch(3), CreationDate(4), Title(5), Url(6),
        SourceTags(7), ModelTags(8), NumUpvotes(9), NumComments(10), PopI(11), WeightedContent(12), Content(13)
    """
    status = 400
    retry_cnt = 2
    sleep_time = 5
    # TIMEOUT = ClientTimeout(total=20)
    TIMEOUT = 20

    while retry_cnt > 0 and status != 200:
        async with session.get(row[6],
                               ssl=ssl.create_default_context(purpose=ssl.Purpose.CLIENT_AUTH),
                               timeout=TIMEOUT) as response:
            res = await response.text()
            res = text_actions.clean_text(str(res))
            # res = res.encode('utf8', 'ignore').decode('utf8', 'ignore')   # FIXME: not working
            status = response.status
            if status == 200 and len(res) != 0:
                pc.printSucc("\t\t <ID = {}><src= {} > ============== #Scraped ....... \t NOW: {}"
                             .format(row[0], row[1], time.strftime("%H:%M:%S", time.localtime())))
                # Tuples are immutable, so rebuild the row with the scraped fields filled in.
                row_list = list(row)
                row_list[12] = text_actions.weightedcontentfromhtml(res)
                row_list[13] = text_actions.contentfromhtml(res)
                row = tuple(row_list)
                if len(row[13]) == 0:
                    global ERR_ASYNC_NO_CONTENT_IN_SCRAPING
                    ERR_ASYNC_NO_CONTENT_IN_SCRAPING += 1
                    pc.printErr("\t\t xxxxxxxxxxxxxxxxxxx SKIPPING for <ID = {}><src= {} > As No Content even after scraping xxxxxxxxxxxxxxxxxxxxxxxx\n"
                                .format(row[0], row[1]))
                return row
            else:
                retry_cnt -= 1
                pc.printWarn("\t x---------------- <ID = {}><src= {} > Unable to hit URL(ERR_CODE={}): {}......... Sleeping for {} Retries remaining = {} -------------x"
                             .format(row[0], row[1], status, row[6][:25], sleep_time, retry_cnt))
                await asyncio.sleep(sleep_time)

    pc.printErr("\t\txxxxx For <ID = {}><src= {} > Totally unable to hit url.... using Title for Content & WeightedContent : {} "
                .format(row[0], row[1], row[6]))
    global ERR_ASYNC_ON_URL_ERROR
    ERR_ASYNC_ON_URL_ERROR += 1
    pc.printMsg(" \t\t\t ============== [Unreachable URL] Will write anyways. <ID = {}><src= {} > =============== "
                .format(row[0], row[1]))
    return row
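# Hypothetical driver for this tuple-row variant: pull rows straight out of the wc
# table (column order assumed from the docstring above) and fetch them concurrently
# behind a semaphore. Table name and semaphore size are illustrative only.
from aiohttp import ClientSession, TCPConnector

async def scrapeWcTable(wc_db, wc_table):
    conn = sqlite3.connect(wc_db, timeout=10)
    rows = conn.execute('SELECT * FROM ' + wc_table).fetchall()
    conn.close()

    sem = asyncio.Semaphore(100)   # cap the number of concurrent fetches
    async with ClientSession(headers={'Connection': 'keep-alive'},
                             connector=TCPConnector(limit=0)) as session:

        async def guarded(row):
            async with sem:
                return await fetchWithRetry(row, session)

        return await asyncio.gather(*(guarded(r) for r in rows))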