import time

import pandas as pd
from comcrawl import IndexClient


def fetch(website, filter, filtertype):
    """Read URLs for `website` from a CSV, optionally filter them, then
    download the matching pages from Common Crawl and return them as a
    DataFrame. `fname` is the base directory, assumed to be defined
    elsewhere in the script."""
    start = time.time()
    client = IndexClient()
    dataframe = pd.read_csv(f"{fname}\\url\\{website}.csv")
    print(len(dataframe))
    dataframe["url"] = dataframe["url"].str.lower()
    my_list = filter.split(",")  # all the filter terms, comma-separated
    if filtertype == 'notrequired':
        filtered = dataframe[~dataframe['url'].str.contains('|'.join(my_list))]
    elif filtertype == 'required':
        filtered = dataframe[dataframe['url'].str.contains('|'.join(my_list))]
    else:
        filtered = dataframe
    print("after filter", len(filtered))
    try:
        client.results = filtered.to_dict("records")
        client.download(threads=4)
        htmldf = pd.DataFrame(client.results)
    except Exception:
        return "empty"
    end = time.time()
    timetaken = end - start
    with open(f'{fname}\\timetaken\\{website}CCtime.txt', 'w') as f:
        f.write(f"time taken is {timetaken}")
    return htmldf
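# Hedged usage sketch for fetch() above. The website name, filter terms, and
# output path are assumptions for illustration; fetch() expects a CSV of URLs
# to already exist under {fname}\url\ and a filtertype of 'required',
# 'notrequired', or anything else to skip filtering.
html_df = fetch("examplenews", "covid,vaccine", "required")
if isinstance(html_df, pd.DataFrame):
    html_df.to_csv(f"{fname}\\html\\examplenews.csv", index=False)
else:
    print("download failed, fetch() returned 'empty'")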
import pandas as pd
from comcrawl import IndexClient


def test_comcrawl(snapshot):
    client = IndexClient(["2019-51"], verbose=True)
    client.search("https://index.commoncrawl.org/*")
    assert len(client.results) == 3

    # filter out duplicates with pandas
    results_df = pd.DataFrame(client.results)
    sorted_results_df = results_df.sort_values(by="timestamp")
    filtered_results_df = sorted_results_df.drop_duplicates("urlkey", keep="last")
    client.results = filtered_results_df.to_dict("records")
    assert len(client.results) == 2

    client.download()
    snapshot.assert_match(client.results[1])
from comcrawl import IndexClient


def download_client(self, index):
    """Create an IndexClient for the given crawl index and store it on the instance."""
    print('--------------------- Download Initiated ---------------------')
    self.client = IndexClient([index], verbose=False)
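# Hedged sketch of how download_client() might be wired up: it is written as an
# instance method, so it presumably belongs to a class that keeps the
# IndexClient around between steps. The class name and crawl label below are
# assumptions for illustration only.
class Crawler:
    pass

Crawler.download_client = download_client  # attach the function defined above
crawler = Crawler()
crawler.download_client("2019-51")
crawler.client.search("bbc.com/news/*")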
from comcrawl import IndexClient
import pandas as pd

client = IndexClient()

# client.search("reddit.com/r/MachineLearning/*")
client.search("producthunt.com/posts/*")

client.results = (pd.DataFrame(client.results)
                  .sort_values(by="timestamp")
                  .drop_duplicates("urlkey", keep="last")
                  .to_dict("records"))

client.download()

pd.DataFrame(client.results).to_csv("prodhunt.csv")
from comcrawl import IndexClient
import pandas as pd

client = IndexClient(["2019-51"])

client.search("bbc.com/news/*")

# client.results = (pd.DataFrame(client.results)
#                   .sort_values(by="timestamp")
#                   .drop_duplicates("urlkey", keep="last")
#                   .to_dict("records"))

client.download()

client.results
# pd.DataFrame(client.results).to_csv("results.csv")
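# After download(), comcrawl adds the fetched page source to each result dict
# under the "html" key, so individual pages can be inspected directly.
first_hit = client.results[0]
print(first_hit["url"])
print(first_hit["html"][:500])  # preview the first 500 characters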
import json

import pandas as pd
import requests
from comcrawl import IndexClient


def search_domain(domain):
    """Query the Common Crawl index API for `domain` and collect the JSON
    records. Reconstructed wrapper: the def line, record_list setup, and the
    index endpoint below are assumptions based on the public Common Crawl
    index API; only the response handling comes from the original fragment."""
    record_list = []
    response = requests.get(
        "https://index.commoncrawl.org/CC-MAIN-2019-51-index",
        params={"url": domain, "output": "json"},
    )
    print(response)
    if response.status_code == 200:
        records = response.content.splitlines()
        for record in records:
            record_list.append(json.loads(record))
        print(f"Added {len(records)} results")
    print(f"found a total of {len(record_list)} hits.")
    return record_list


# search_domain("cnn.com")

client = IndexClient()

print("searching")
client.search("cnn.com", threads=4)

print("sorting")
client.results = (pd.DataFrame(client.results)
                  .sort_values(by="timestamp")
                  .drop_duplicates("urlkey", keep="last")
                  .to_dict("records"))
print(len(client.results))

print("downloading")
client.download(threads=4)

pd.DataFrame(client.results).to_csv("results.csv")
# client.download(threads=2)
import sys

import pandas as pd
from comcrawl import IndexClient

client = IndexClient([
    '2020-05', '2020-10', '2020-16', '2020-24', '2020-29', '2020-34', '2020-40'
])

urlss = [
    'reuters.com/*', 'cnn.com/*', 'nytimes.com/*', 'bbc.com/*', 'cepr.org/*',
    'economist.com/*'
]

# search() stores its hits on client.results rather than returning them,
# so accumulate the hits for each pattern into a single list
all_results = []
for ul in urlss:
    client.search(ul, threads=6)
    all_results.extend(client.results)
client.results = all_results

# drop duplicate captures, keeping the most recent one per urlkey
client.results = (pd.DataFrame(client.results)
                  .sort_values(by="timestamp")
                  .drop_duplicates("urlkey", keep="last")
                  .to_dict("records"))

# keep only successfully captured covid-related pages with economic terms
# in the URL, capped at 1000 records
client.results = [
    res for res in client.results
    if res['status'] == '200' and 'covid' in res['url'] and (
        'financ' in res['url'] or 'econ' in res['url'] or
        'cost' in res['url'] or 'job' in res['url'])
][:1000]

df = pd.DataFrame(client.results)
df = df.drop(columns=['urlkey', 'charset', 'digest', 'mime-detected',
                      'languages', 'mime'])