Code example #1
# Assumed imports for this example; pc, csv_functions and the
# semaphoreSafeFetch / write_result helpers are project-local, and the
# module-level counters (ENTRIES_TO_BE_WRITTEN, WRITTEN_ENTRIES_ASYNC_DIRECT)
# are assumed to be initialized elsewhere.
import asyncio
import csv
import time

from aiohttp import ClientSession, TCPConnector


async def asyncFetchAll(csv_in, csv_out):
    """
        INPUT: csv_in (source csv) & csv_out (destination csv, to be written)
        NOTE:
            * The semaphore caps the number of concurrent fetches (see `sem` below)
            * Responses are written to csv_out in chunks of `N` entries at a time
    """

    tasks = []
    sem = asyncio.Semaphore(5)
    """ Initialize the output file """
    headers = [
        'ID', 'SourceSite', 'ProcessingDate', 'ProcessingEpoch',
        'CreationDate', 'Title', 'Url', 'SourceTags', 'ModelTags',
        'NumUpvotes', 'NumComments', 'PopI', 'WeightedContent', 'Content'
    ]
    csv_functions.creteCsvFile(csv_out, headers)

    connector = TCPConnector(limit=10)
    async with ClientSession(headers={'Connection': 'keep-alive'},
                             connector=connector) as session:
        with open(csv_in, mode='r') as csvfile:
            csv_reader = csv.DictReader(csvfile)
            global ENTRIES_TO_BE_WRITTEN
            for row in csv_reader:
                ENTRIES_TO_BE_WRITTEN += 1
                if (len(row["Content"]) != 0):
                    pc.printWarn(
                        "\t <ID = {}><src= {} > [NO SCRAPING] Content already exists............... NOW: {}"
                        .format(row["ID"], row["SourceSite"],
                                time.strftime("%H:%M:%S", time.localtime())))
                    row["WeightedContent"] = row["Title"] + row[
                        "WeightedContent"]
                    row["Content"] = row["Content"]
                    await write_result(csv_out, row)
                    global WRITTEN_ENTRIES_ASYNC_DIRECT
                    WRITTEN_ENTRIES_ASYNC_DIRECT += 1
                    pc.printMsg(
                        " \t\t ==============  Done Writing into csv for <ID = {}><src= {} >=============== "
                        .format(row["ID"], row["SourceSite"]))
                elif (row["Url"] and row["Title"]):
                    task = asyncio.ensure_future(
                        semaphoreSafeFetch(sem, row, session, csv_out))
                    tasks.append(task)

        responses = await asyncio.gather(*tasks)
        pc.printMsg(
            "\n@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ Total items to actually scrape(found w/o Content) = {} @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n"
            .format(len(responses)))
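
Example #1 relies on two helpers that are not shown: `semaphoreSafeFetch`, which does the semaphore-guarded fetch and writes the finished row, and `write_result`, which appends to the output csv. Below is a minimal sketch of the fetch helper, assuming the `text_actions` extraction functions used in the other examples; it illustrates the pattern and is not the project's actual implementation.

async def semaphoreSafeFetch(sem, row, session, csv_out):
    # Hypothetical sketch: limit concurrency with the shared semaphore, fetch
    # the row's Url, extract Content / WeightedContent the same way the
    # synchronous path does, and hand the finished row to write_result.
    async with sem:
        try:
            async with session.get(row["Url"]) as resp:
                html = await resp.text()
        except Exception:
            return row  # Content stays empty; the caller can decide to skip it
    content = text_actions.contentfromhtml(html)
    urlstrings = text_actions.getUrlString(content)
    row["Content"] = text_actions.clean_text(content) + urlstrings
    row["WeightedContent"] = text_actions.clean_text(
        text_actions.weightedcontentfromhtml(html) + row["Title"]) + urlstrings
    await write_result(csv_out, row)
    return row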
Code example #2
async def asyncFetchAll(csv_in, csv_out):
    """
        INPUT: csv_in (source csv) & csv_out (destination csv, to be written)
        NOTE:
            * The semaphore caps the number of concurrent fetches (see `sem` below)
            * Responses are written to csv_out in chunks of `N` entries at a time
        Imports are the same as in Code example #1 (plus text_actions).
    """

    tasks = []
    sem = asyncio.Semaphore(1000)
    """ Initialize the output file """
    headers = [
        'ID', 'SourceSite', 'ProcessingDate', 'ProcessingEpoch',
        'CreationDate', 'Title', 'Url', 'SourceTags', 'ModelTags',
        'NumUpvotes', 'NumComments', 'PopI', 'WeightedContent', 'Content'
    ]
    csv_functions.creteCsvFile(csv_out, headers)

    connector = TCPConnector(limit=0)
    async with ClientSession(headers={'Connection': 'keep-alive'},
                             connector=connector) as session:
        with open(csv_in, mode='r') as csvfile:
            csv_reader = csv.DictReader(csvfile)
            global ENTRIES_TO_BE_WRITTEN
            for row in csv_reader:
                ENTRIES_TO_BE_WRITTEN += 1
                if (len(row["Content"]) != 0):
                    pc.printWarn(
                        "\t <ID = {}><src= {} > [NO SCRAPING] Content already exists............... NOW: {}"
                        .format(row["ID"], row["SourceSite"],
                                time.strftime("%H:%M:%S", time.localtime())))
                    row["WeightedContent"] = text_actions.clean_text(
                        row["Title"] +
                        row["WeightedContent"]) + text_actions.getUrlString(
                            row["Content"])
                    row["Content"] = text_actions.clean_text(
                        row["Content"]) + text_actions.getUrlString(
                            row["Content"])
                    entry = [
                        row["ID"],
                        row["SourceSite"],
                        row["ProcessingDate"],
                        row["ProcessingEpoch"],
                        row["CreationDate"],
                        row["Title"],
                        row["Url"],
                        row["SourceTags"],
                        row["ModelTags"],
                        row["NumUpvotes"],
                        row["NumComments"],
                        row["PopI"],
                        row["WeightedContent"],  # same column order as `headers`
                        row["Content"],
                    ]
                    csv_functions.putToCsv(csv_out, entry)
                    global WRITTEN_ENTRIES_ASYNC_DIRECT
                    WRITTEN_ENTRIES_ASYNC_DIRECT += 1
                    pc.printMsg(
                        " \t\t ============== Done Writing into csv for <ID = {}><src= {} >=============== "
                        .format(row["ID"], row["SourceSite"]))
                elif (row["Url"] and row["Title"]):
                    task = asyncio.ensure_future(
                        semaphoreSafeFetch(sem, row, session))
                    tasks.append(task)

        responses = await asyncio.gather(*tasks)

        pc.printMsg(
            "\n@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ len(responses):: to be scraped = {} @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n"
            .format(len(responses)))

        for row in responses:
            if row["Content"] or row["Title"]:
                if len(row["Content"]) == 0:
                    # the url was hit but no content could be extracted, so fall back to the title
                    row["Content"] = row["Title"]
                entry = [
                    row["ID"],
                    row["SourceSite"],
                    row["ProcessingDate"],
                    row["ProcessingEpoch"],
                    row["CreationDate"],
                    row["Title"],
                    row["Url"],
                    row["SourceTags"],
                    row["ModelTags"],
                    row["NumUpvotes"],
                    row["NumComments"],
                    row["PopI"],
                    row["WeightedContent"],  # same column order as `headers`
                    row["Content"],
                ]
                await write_result(csv_out, entry)
                # csv_functions.putToCsv(csv_out, entry)
                global WRITTEN_ENTRIES_ASYNC_SCRAPED
                WRITTEN_ENTRIES_ASYNC_SCRAPED += 1
                pc.printMsg(
                    " \t\t ============== Done Writing into csv for <ID = {}><src= {} > =============== "
                    .format(row["ID"], row["SourceSite"]))
            else:
                pc.printErr(
                    "\t\t xxxxxxxxxxxxxxxxxxx Skipping  for <ID = {}><src= {} > As No Content & Title xxxxxxxxxxxxxxxxxxxxxxxx\n"
                    .format(row["ID"], row["SourceSite"]))
Code example #3
# Assumed imports for this example; pc, csv_functions, text_actions and
# web_requests are project-local helpers, and the module-level counters
# (WRITTEN_ENTRIES_SYNC, SKIPPED_SYNC, FAILED_SYNC) are assumed to be
# initialized elsewhere.
import csv
import time
from datetime import datetime


def RunSync(ts):
    """
        Picks the wc-db table mapped to `ts` and scrapes (useful) "clean" Content & WeightedContent from each row's url.
        * NOTE:
            * If content is already present in the table, "clean" it too & append the newly scraped content to it.
            * FIRST RUN: time = 17 hours, data = 12 MB, #entries = 6.5k
        Input: ts (format: 1598692058.887741)
    """
    pc.printMsg(
        '@[{}] >>>>>> Started Content-scraper(SYNC) ................... => FILENAME: {}\n'
        .format(datetime.fromtimestamp(ts),
                'dbs/wc-db/wc_table_' + str(int(ts)) + '_wc_sync.csv'))

    csv_src_file = '/Users/aayush.chaturvedi/Sandbox/cynicalReader/dbs/wc-db/wc_table_' + str(
        int(ts)) + '.csv'
    csv_dest_file = '/Users/aayush.chaturvedi/Sandbox/cynicalReader/dbs/wc-db/wc_table_' + str(
        int(ts)) + '_wc_sync.csv'
    headers = [
        'ID', 'SourceSite', 'ProcessingDate', 'ProcessingEpoch',
        'CreationDate', 'Title', 'Url', 'SourceTags', 'ModelTags',
        'NumUpvotes', 'NumComments', 'PopI', 'WeightedContent', 'Content'
    ]
    csv_functions.creteCsvFile(csv_dest_file, headers)

    # creteCsvFile() above already (re)creates csv_dest_file with the header
    # row, so there is no need to reopen the file and rewrite the headers here.
    with open(csv_src_file, mode='r') as csvfile:
        csv_reader = csv.DictReader(csvfile)
        line_count = 0
        for row in csv_reader:
            if line_count == 0:
                # DictReader has already consumed the header row; this just logs
                # the column names once (the first data row is still processed below)
                print(f'Headers are {", ".join(row)}')
                line_count += 1
            #CHECK1(pre scraping): if (content != NULL) => no scraping, just put it in as is
            if (len(row["Content"]) != 0):
                pc.printWarn(
                    "\t <ID = {} > [NO SCRAPING] Content already exists....putting as it is............. NOW: {}"
                    .format(row["ID"],
                            time.strftime("%H:%M:%S", time.localtime())))
                entry = [
                    row["ID"],
                    row["SourceSite"],
                    row["ProcessingDate"],
                    row["ProcessingEpoch"],
                    row["CreationDate"],
                    row["Title"],
                    row["Url"],
                    row["SourceTags"],
                    row["ModelTags"],
                    row["NumUpvotes"],
                    row["NumComments"],
                    row["PopI"],
                    text_actions.clean_text(row["Title"] +
                                            row["WeightedContent"]) +
                    text_actions.getUrlString(
                        row["Content"]),  #add the url-words too
                    text_actions.clean_text(row["Content"]) +
                    text_actions.getUrlString(row["Content"])
                ]
                global WRITTEN_ENTRIES_SYNC
                WRITTEN_ENTRIES_SYNC += 1
                with open(csv_dest_file, "a", newline='') as out_f:
                    csv.writer(out_f).writerow(entry)
            #CHECK2(pre scraping): if(url == NULL)=>discard
            #CHECK3(pre scraping): if (row["title"]==NULL)=>discard
            elif ((len(row["Url"]) != 0) and (len(row["Title"]) != 0)):
                pc.printWarn(
                    "\t <ID = {} > [SCRAPING BEGIN] sleeping for 0.0001 second ZZZZZZzzzzzzzzzzzz................. NOW: {}"
                    .format(row["ID"],
                            time.strftime("%H:%M:%S", time.localtime())))
                time.sleep(0.0001)
                try:
                    # response = web_requests.hitGetWithRetry(url,TIMEOUT=10)
                    response = web_requests.hitGetWithRetry(
                        row["Url"], '', False, 2, 0.5, 60)
                    # if response.status_code == 200:
                    if response != -1:
                        # content = text_actions.contentfromhtml(response)  #NOTE: for sync
                        content = text_actions.contentfromhtml(
                            response.text)  #NOTE: for Async
                        urlstrings = text_actions.getUrlString(content)
                        content += urlstrings  #add the url-words too
                        # weightedcontent = text_actions.weightedcontentfromhtml(response.text) + row["Title"] + urlstrings   #NOTE: for sync
                        weightedcontent = text_actions.weightedcontentfromhtml(
                            response.text) + row["Title"] + urlstrings  # add the url-words too   #NOTE: for async
                        line_count += 1
                        #CHECK1(post scraping): if (content == null) && (row["Title"] != null, already checked above)
                        # => use the title as both Content and WeightedContent
                        if (len(content) == 0):
                            content = row["Title"]
                            weightedcontent = row["Title"]
                        entry = [
                            row["ID"], row["SourceSite"],
                            row["ProcessingDate"], row["ProcessingEpoch"],
                            row["CreationDate"], row["Title"], row["Url"],
                            row["SourceTags"], row["ModelTags"],
                            row["NumUpvotes"], row["NumComments"],
                            row["PopI"],
                            text_actions.clean_text(weightedcontent),
                            text_actions.clean_text(content)
                        ]

                        with open(csv_dest_file, "a", newline='') as out_f:
                            csv.writer(out_f).writerow(entry)
                        pc.printMsg(
                            "\t\t <ID = {} > ============== Scraping Done....... \t NOW: {}"
                            .format(
                                row["ID"],
                                time.strftime("%H:%M:%S", time.localtime())))
                    else:
                        global SKIPPED_SYNC
                        SKIPPED_SYNC += 1
                        pc.printErr(
                            "\t\txxxxx SKIPPING... for ID: {} Unable to hit url: {} , "
                            .format(row["ID"], row["Url"]))
                except Exception as e:
                    global FAILED_SYNC
                    FAILED_SYNC += 1
                    pc.printErr(
                        "\t======= XXXXXXXX ERROR XXXXXX ======>> ID= {} NOW = {} Skipping...Failed due to: \n \t\t ERROR {}"
                        .format(row["ID"],
                                time.strftime("%H:%M:%S", time.localtime()),
                                e))
                    pass
    pc.printMsg(
        "\n****************** Content Scraping is Complete , FILENAME: {} ********************\n"
        .format('dbs/wc-db/wc_table_' + str(int(ts)) + '_wc_sync.csv'))
    pc.printMsg(
        "\n----------------------------------------------------------------------------------\n"
    )
    pc.printMsg(
        "|\tWRITTEN_ENTRIES_SYNC \t  | \t {} \t|".format(WRITTEN_ENTRIES_SYNC))
    pc.printMsg("|\tSKIPPED_SYNC          \t | \t {} \t|".format(SKIPPED_SYNC))
    pc.printMsg("|\tFAILED_SYNC           \t | \t {} \t|".format(FAILED_SYNC))
    pc.printMsg(
        "\n----------------------------------------------------------------------------------\n"
    )
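
RunSync delegates the actual HTTP call to `web_requests.hitGetWithRetry`, whose signature is not shown (the call passes the url plus five positional arguments, and the helper returns -1 on failure). Purely to illustrate the retry-with-backoff idea, here is a hypothetical stand-in built on the `requests` library; the parameter names and defaults are assumptions, not the project's real helper.

import time

import requests  # assumed HTTP library for the synchronous path


def hit_get_with_retry(url, retries=2, backoff=0.5, timeout=60):
    # Hypothetical sketch: try the url up to `retries + 1` times, sleep
    # `backoff` seconds between attempts, and mimic the original convention
    # of returning -1 when every attempt fails.
    for attempt in range(retries + 1):
        try:
            response = requests.get(url, timeout=timeout)
            if response.status_code == 200:
                return response
        except requests.RequestException:
            pass
        time.sleep(backoff)
    return -1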
Code example #4
# Assumed imports for this example (pc, csv_functions and text_actions are
# project-local helpers referenced below):
import csv
import os


def cleanNcheckAsyncOutput(csv_in, csv_out):
    """
        Analyses the input & created files.
        Also cleans Content & WeightedContent -> puts them into a new file and deletes the old one.
        Variables:
            * NO_LINES_IN_INPUT_CSV
            * NO_LINES_IN_OUTPUT_CSV
            * NO_LINES_IN_OUTPUT_WITHOUT_TITLE
            * NO_LINES_IN_OUTPUT_WITHOUT_URL
            * NO_LINES_IN_OUTPUT_WITHOUT_CONTENT
    """

    f = open(csv_in, "r+")
    f.fseek(0)  # reach to first line
    reader = csv.reader(f)
    NO_LINES_IN_INPUT_CSV = len(list(reader))
    """ Now check and create new "cleaned" file """

    headers = [
        'ID', 'SourceSite', 'ProcessingDate', 'ProcessingEpoch',
        'CreationDate', 'Title', 'Url', 'SourceTags', 'ModelTags',
        'NumUpvotes', 'NumComments', 'PopI', 'WeightedContent', 'Content'
    ]
    csv_final_out = os.path.join("F", csv_out)
    csv_functions.creteCsvFile(csv_final_out, headers)

    pc.prCyan(
        " ========================== NOW CREATING FINAL OUTPUT FILE: {} ==========================="
        .format(csv_final_out))

    NO_LINES_IN_OUTPUT_CSV = 0
    NO_LINES_IN_OUTPUT_WITHOUT_TITLE = 0
    NO_LINES_IN_OUTPUT_WITHOUT_URL = 0
    NO_LINES_IN_OUTPUT_WITHOUT_CONTENT = 0
    with open(csv_out, mode='r') as r, open(csv_final_out, 'a+',
                                            newline='') as f:
        reader = csv.DictReader(r)  # DictReader already consumes the header row
        writer = csv.writer(f)
        for row in reader:
            url_string_content = text_actions.getUrlString(row["Content"])
            content = text_actions.clean_text(row["Content"])
            weighted_content = text_actions.clean_text(row["WeightedContent"])
            entry = [
                row["ID"],
                row["SourceSite"],
                row["ProcessingDate"],
                row["ProcessingEpoch"],
                row["CreationDate"],
                row["Title"],
                row["Url"],
                row["SourceTags"],
                row["ModelTags"],
                row["NumUpvotes"],
                row["NumComments"],
                row["PopI"],
                weighted_content + url_string_content,
                content,
            ]
            writer.writerow(entry)
            NO_LINES_IN_OUTPUT_CSV += 1
            if (len(row["Title"]) == 0):
                NO_LINES_IN_OUTPUT_WITHOUT_TITLE += 1
            if (len(row["Url"]) == 0):
                NO_LINES_IN_OUTPUT_WITHOUT_URL += 1
            if (len(row["Content"]) == 0):
                NO_LINES_IN_OUTPUT_WITHOUT_CONTENT += 1

    #TODO:  os.remove(csv_in) %% rename

    pc.printWarn(
        "\n\n~~~~~~~~~~~~~~~~~~~~~~~~~~~ Analysis ~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"
    )
    pc.printWarn(
        "|\t NO_LINES_IN_INPUT_CSV                 \t | \t  {}  \t|".format(
            NO_LINES_IN_INPUT_CSV))
    pc.printWarn(
        "|\t NO_LINES_IN_OUTPUT_CSV                \t | \t  {}  \t|".format(
            NO_LINES_IN_OUTPUT_CSV))
    pc.printWarn(
        "|\t NO_LINES_IN_OUTPUT_WITHOUT_TITLE      \t | \t  {}  \t|".format(
            NO_LINES_IN_OUTPUT_WITHOUT_TITLE))
    pc.printWarn(
        "|\t NO_LINES_IN_OUTPUT_WITHOUT_URL        \t | \t  {}  \t|".format(
            NO_LINES_IN_OUTPUT_WITHOUT_URL))
    pc.printWarn(
        "|\t NO_LINES_IN_OUTPUT_WITHOUT_CONTENT    \t | \t  {}  \t|".format(
            NO_LINES_IN_OUTPUT_WITHOUT_CONTENT))
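
For context, a hypothetical driver tying the pieces above together might look like the following; the timestamp handling and file paths are illustrative assumptions based on the paths used in these examples.

import asyncio
import time

if __name__ == "__main__":
    ts = time.time()
    csv_in = 'dbs/wc-db/wc_table_' + str(int(ts)) + '.csv'            # assumed input path
    csv_out = 'dbs/wc-db/wc_table_' + str(int(ts)) + '_wc_async.csv'  # assumed async output path

    # Scrape asynchronously, then post-process the async output into the
    # final cleaned csv (see cleanNcheckAsyncOutput above).
    loop = asyncio.get_event_loop()
    loop.run_until_complete(asyncFetchAll(csv_in, csv_out))
    cleanNcheckAsyncOutput(csv_in, csv_out)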