Example #1
import time

import pandas as pd
from comcrawl import IndexClient


def fetch(website, filters, filtertype):
    """Download Common Crawl captures for `website`, filtered by URL substrings."""
    start = time.time()
    client = IndexClient()

    # `fname` is assumed to be a base directory defined elsewhere
    dataframe = pd.read_csv(f"{fname}\\url\\{website}.csv")
    print(len(dataframe))
    dataframe["url"] = dataframe["url"].str.lower()
    terms = filters.split(",")  # comma-separated filter terms

    if filtertype == 'notrequired':
        # keep rows whose URL matches none of the terms (regex alternation)
        filtered = dataframe[~dataframe['url'].str.contains('|'.join(terms))]
    elif filtertype == 'required':
        # keep rows whose URL matches at least one of the terms
        filtered = dataframe[dataframe['url'].str.contains('|'.join(terms))]
    else:
        filtered = dataframe

    print("after filter", len(filtered))
    try:
        client.results = filtered.to_dict("records")
        client.download(threads=4)
        htmldf = pd.DataFrame(client.results)
    except Exception:
        return "empty"

    timetaken = time.time() - start
    with open(f'{fname}\\timetaken\\{website}CCtime.txt', 'w') as f:
        f.write(f"time taken is {timetaken}")

    return htmldf
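
A minimal usage sketch for the function above; the base directory, call arguments, and input CSV are hypothetical, not part of the original example:

# Hypothetical setup: `fname` and the CSV of candidate URLs are assumptions.
fname = "C:\\data\\crawl"

html_df = fetch("example", "blog,news", "required")
if isinstance(html_df, str):  # the function returns "empty" on failure
    print("download failed")
else:
    print(html_df.head())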
Example #2
import pandas as pd
from comcrawl import IndexClient


def test_comcrawl(snapshot):
    # `snapshot` is a snapshot-testing pytest fixture (e.g. snapshottest)
    client = IndexClient(["2019-51"], verbose=True)
    client.search("https://index.commoncrawl.org/*")

    assert len(client.results) == 3

    # filter out duplicates with pandas: keep only the latest
    # capture per urlkey
    results_df = pd.DataFrame(client.results)
    sorted_results_df = results_df.sort_values(by="timestamp")
    filtered_results_df = sorted_results_df.drop_duplicates("urlkey",
                                                            keep="last")
    client.results = filtered_results_df.to_dict("records")

    assert len(client.results) == 2

    client.download()

    snapshot.assert_match(client.results[1])
Example #3
def download_client(self, index):
    """Create an IndexClient for a single Common Crawl index and store it
    on the instance (method fragment from a larger class)."""
    print('--------------------- Download Initiated ---------------------')
    client = IndexClient([index], verbose=False)
    self.client = client
Example #4
from comcrawl import IndexClient
import pandas as pd

client = IndexClient()
#client.search("reddit.com/r/MachineLearning/*")
client.search("producthunt.com/posts/*")


# keep only the latest capture per urlkey
client.results = (pd.DataFrame(client.results)
                  .sort_values(by="timestamp")
                  .drop_duplicates("urlkey", keep="last")
                  .to_dict("records"))

client.download()

pd.DataFrame(client.results).to_csv("prodhunt.csv")
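
The sort-then-deduplicate chain above recurs throughout these examples (it also appears commented out in Example #5). A small helper capturing the pattern might look like the sketch below; the function name is an illustration, not part of the comcrawl API:

import pandas as pd


def dedup_results(results):
    # Keep only the most recent capture per urlkey (illustrative helper,
    # not part of comcrawl).
    return (pd.DataFrame(results)
            .sort_values(by="timestamp")
            .drop_duplicates("urlkey", keep="last")
            .to_dict("records"))


# e.g. client.results = dedup_results(client.results)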
Example #5
from comcrawl import IndexClient
import pandas as pd

client = IndexClient(["2019-51"])
client.search("bbc.com/news/*")

# client.results = (pd.DataFrame(client.results)
#                   .sort_values(by="timestamp")
#                   .drop_duplicates("urlkey", keep="last")
#                   .to_dict("records"))

client.download()

client.results  # inspect the results (notebook-style; a script would print them)

# pd.DataFrame(client.results).to_csv("results.csv")
Example #6
import json

import pandas as pd
import requests
from comcrawl import IndexClient

INDEXES = ["2019-51"]  # assumption: Common Crawl index names to query


def search_domain(domain):
    # The function head was missing from the original snippet; this is a
    # reconstructed sketch that queries the Common Crawl index API directly.
    record_list = []
    for index in INDEXES:
        response = requests.get(
            f"https://index.commoncrawl.org/CC-MAIN-{index}-index",
            params={"url": domain, "output": "json"},
        )
        print(response)
        if response.status_code == 200:
            records = response.content.splitlines()
            for record in records:
                record_list.append(json.loads(record))
            print(f"Added {len(records)} results")

    print(f"found a total of {len(record_list)} hits.")

    return record_list


# search_domain("cnn.com")

client = IndexClient()
print("searching")
client.search("cnn.com", threads=4)

print("sorting")
client.results = (pd.DataFrame(client.results).sort_values(
    by="timestamp").drop_duplicates("urlkey", keep="last").to_dict("records"))

print(len(client.results))
print("downloading")
client.download(threads=4)

pd.DataFrame(client.results).to_csv("results.csv")

#client.download(threads=2)
Example #7
import pandas as pd
from comcrawl import IndexClient

client = IndexClient([
    '2020-05', '2020-10', '2020-16', '2020-24', '2020-29', '2020-34', '2020-40'
])

urlss = [
    'reuters.com/*', 'cnn.com/*', 'nytimes.com/*', 'bbc.com/*', 'cepr.org/*',
    'economist.com/*'
]

# client.search() stores its hits in client.results (overwriting any
# previous search), so accumulate the results of each query manually
all_results = []
for ul in urlss:
    client.search(ul, threads=6)
    all_results.extend(client.results)
client.results = all_results

# keep only the latest capture per urlkey
client.results = (pd.DataFrame(client.results)
                  .sort_values(by="timestamp")
                  .drop_duplicates("urlkey", keep="last")
                  .to_dict("records"))

# keep successful captures of COVID-related finance/economy/cost/job URLs,
# capped at 1000 records
client.results = [
    res for res in client.results
    if res['status'] == '200' and 'covid' in res['url'] and (
        'financ' in res['url'] or 'econ' in res['url'] or 'cost' in res['url']
        or 'job' in res['url'])
][:1000]

df = pd.DataFrame(client.results)
# drop index metadata columns that aren't needed downstream; errors="ignore"
# guards against columns absent from a given result set
df = df.drop(columns=['urlkey', 'charset', 'digest', 'mime-detected',
                      'languages', 'mime'], errors="ignore")
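
Unlike the earlier examples, this snippet stops after filtering the index records. A hedged follow-up, mirroring the download-and-save pattern used above (the output filename is illustrative):

# Illustrative follow-up, not in the original snippet: fetch the HTML for
# the filtered records and persist everything to CSV.
client.download(threads=4)
pd.DataFrame(client.results).to_csv("covid_econ_results.csv", index=False)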