class SampledataTestCase(unittest.TestCase):
    """Exercise SampleData.read_pickle() against pkl_example/yelp_sample.pkl."""

    def setUp(self):
        """Build a SampleData fixture pointing at the pickle two levels up."""
        # os.path.join (instead of string concatenation with os.path.sep)
        # keeps the path relative when dirname(__file__) is empty; the old
        # '' + '/' + '../../' form resolved to the filesystem root.
        base_dir = os.path.abspath(
            os.path.join(os.path.dirname(__file__), os.pardir, os.pardir))
        sample_file = os.path.join(base_dir, 'pkl_example', 'yelp_sample.pkl')
        self.sampledata = SampleData(0.5, sample_file, 'pkl', "id", ["name"])

    def tearDown(self):
        """Drop the fixture so each test starts from a fresh SampleData."""
        self.sampledata = None

    def test_loadSample(self):
        """Clearing the sample and re-reading the pickle yields 484 entries."""
        self.sampledata.setSample(None)
        self.sampledata.read_pickle()
        sample = self.sampledata.getSample()
        # assertEqual survives `python -O` and reports both values on failure.
        self.assertEqual(len(sample), 484)
""" \dblp_sample.csv dblp_3881.csv dblp_result\\result_file.pkl result_file.csv match_file.pkl match_file.csv enriched_file.pkl enriched_file.csv """ sample_file = 'dblp_sample.csv' localdata_file = 'dblp_3881.csv' result_dir = 'dblp_result' sampledata = SampleData(sample_ratio=0.5, samplepath=sample_file, filetype='csv', uniqueid="key", querylist=["title"]) localdata = LocalData(localpath=localdata_file, filetype='csv', uniqueid="ID", querylist=['title'], matchlist=['title']) hiddendata = HiddenData(result_dir=result_dir, uniqueid="info.key", matchlist=["info.title"]) budget = 20 smartcrawl.smartCrawl(budget, dblp, sampledata, localdata, hiddendata) """ pool_thre = 2 jaccard_thre = 0.85
yelp = SearchApi(client_id=client_id, client_secret=client_secret, top_k=300,
                 delay=5, search_term=search_term, **parameters)

# File layout referenced by this example (these notes used to live in a bare
# string literal whose "\y" is an invalid escape sequence):
#   yelp_sample_AZ.pkl      - sample file
#   yelp_3000_AZ.csv        - local data file
#   yelp_result/            - result directory, listing:
#       result_file.pkl,   result_file.csv
#       match_file.pkl,    match_file.csv
#       enriched_file.pkl, enriched_file.csv
sample_file = 'yelp_sample_AZ.pkl'
localdata_file = 'yelp_3000_AZ.csv'
result_dir = 'yelp_result'

sampledata = SampleData(sample_ratio=0.5, samplepath=sample_file,
                        filetype='pkl', uniqueid="business_id",
                        querylist=["name"])
localdata = LocalData(localpath=localdata_file, filetype='csv',
                      uniqueid="business_id", querylist=["name"],
                      matchlist=["name", "full_address"])
hiddendata = HiddenData(result_dir=result_dir, uniqueid="id",
                        matchlist=["name", "location.display_address.*"])

budget = 20
smartcrawl.smartCrawl(budget, yelp, sampledata, localdata, hiddendata)

# Alternative invocation that passes the tuning arguments explicitly:
#   pool_thre = 2
#   jaccard_thre = 0.85
#   threads = 4
#   smartcrawl.smartCrawl(budget, yelp, sampledata, localdata, hiddendata,
#                         pool_thre, jaccard_thre, threads)
def setUp(self):
    """Build a SampleData fixture from yelp_example/yelp_sample_AZ.pkl."""
    # os.path.join keeps the path relative when dirname(__file__) is empty;
    # the previous '' + os.path.sep + '../../' form resolved to the
    # filesystem root in that case.
    base_dir = os.path.abspath(
        os.path.join(os.path.dirname(__file__), os.pardir, os.pardir))
    sample_file = os.path.join(base_dir, 'yelp_example', 'yelp_sample_AZ.pkl')
    self.sampledata = SampleData(0.5, sample_file, 'pkl', "business_id",
                                 ["name"])
# NOTE(review): these credentials are committed in source. They are kept
# verbatim so behaviour is unchanged, but they should be moved to settings /
# environment configuration and rotated.
_YELP_CLIENT_ID = "QhqrWe9agsd0Ad6Gs0qgMQ"
_YELP_CLIENT_SECRET = "6WQWRMV8edOhaThyWgm96wAJkIzJ1pHOhm5N0AD20edrnzv0lwi3wfgZAFp0IqQ6WIc-pZki83kjpViwptlcsiV0-Ij3HI6AJxhOTE4jsjNOoZOHZI3823twg8yZWXYx"
_GOOGLE_API_KEY = 'AIzaSyDhBJSPqHfcEkPGQGbH7l3eWyF_PhF10iw'

# Supported API labels, in the order they are tested against api_msg[0].
# Each entry: (label, (hidden unique-id field, hidden query field,
#                      hidden match fields)).
_API_FIELDS = (
    ('dblp Publ API', ("info.key", "info.title", ("info.title",))),
    ('aminer Publ API', ("id", "title", ("title",))),
    ('yelp Search API', ("id", "name",
                         ("name", "location.display_address.*"))),
    ('google Place API', ("place_id", "name",
                          ("name", "formatted_address"))),
)


def _detect_api(api_msg):
    """Return (label, fields) for the first supported label in api_msg[0]."""
    for label, fields in _API_FIELDS:
        if label in api_msg[0]:
            return label, fields
    raise ValueError("Unsupported API: %r" % (api_msg[0],))


def _strip_typo_markup(original_csv):
    """Replace '</span>'-marked cells with parsed text; return affected row ids."""
    typo_ids = []
    # NOTE(review): one parser instance is reused for every cell; whether
    # get_text() resets between feeds is not visible from here.
    parser = Deeper_HTMLParser()
    for row in original_csv[1:]:  # row 0 is skipped, as in the AutoID logic
        for col in range(1, len(row)):
            if '</span>' in row[col]:
                parser.feed(row[col])
                row[col] = parser.get_text()
                typo_ids.append(row[0])
                break  # only the first marked cell of a row is rewritten
    return typo_ids


def _build_api(label, api_msg, naive):
    """Construct the search API client for the smart or naive crawl."""
    if label == 'dblp Publ API':
        hits = 1 if naive else 1000
        return PublApi(top_k=hits, delay=5, search_term='q', h=hits)
    if label == 'aminer Publ API':
        return AdvancedPublApi(top_k=1 if naive else 300, delay=5,
                               search_term='term',
                               size=1 if naive else 100, sort='relevance')
    if label == 'yelp Search API':
        # NOTE(review): the naive crawl hard-codes location 'AZ' while the
        # smart crawl uses api_msg[1]; kept as-is, confirm this is intended.
        location = 'AZ' if naive else api_msg[1]
        return SearchApi(client_id=_YELP_CLIENT_ID,
                         client_secret=_YELP_CLIENT_SECRET,
                         top_k=1 if naive else 300, delay=5,
                         search_term='term',
                         limit=1 if naive else 50, location=location)
    # 'google Place API': identical settings for both crawls.
    return TextSearchApi(location='in+' + api_msg[1], top_k=60, delay=5,
                         search_term='query', key=_GOOGLE_API_KEY)


def _build_sampledata(label, api_msg):
    """Construct the SampleData that seeds the smart crawl for this API."""
    if label in ('dblp Publ API', 'aminer Publ API'):
        sample_file = settings.BASE_DIR + '/netdisk/dblp_sample.csv'
        return SampleData(sample_ratio=0.5, samplepath=sample_file,
                          filetype='csv', uniqueid="key",
                          querylist=["title"])
    sample_file = "%s/netdisk/yelp_sample_%s.pkl" % (settings.BASE_DIR,
                                                     api_msg[1])
    return SampleData(sample_ratio=0.5, samplepath=sample_file,
                      filetype='pkl', uniqueid="business_id",
                      querylist=["name"])


def _build_localdata(original_csv, local_match, hidden_match, fields):
    """Construct LocalData, appending an 'AutoID' column if no id is mapped."""
    id_field, query_field, match_fields = fields

    def local_column(hidden_field):
        # local_match and hidden_match are parallel sequences.
        return local_match[hidden_match.index(hidden_field)]

    if id_field in hidden_match:
        unique_id = local_column(id_field)
    else:
        # No local column is mapped to the hidden id: synthesise one using
        # the row number (row 0 receives the column name).
        unique_id = 'AutoID'
        original_csv[0].append(unique_id)
        for row_number in range(1, len(original_csv)):
            original_csv[row_number].append(row_number)
    return LocalData(uniqueid=unique_id,
                     querylist=[local_column(query_field)],
                     matchlist=[local_column(name) for name in match_fields],
                     data_raw=original_csv)


def _sorted_match_pairs(hiddendata):
    """Return match pairs sorted by local id, numerically when possible."""
    pairs = hiddendata.getMatchPair().items()
    try:
        return sorted(pairs, key=lambda item: int(item[0]))
    except ValueError:
        # Non-numeric ids: fall back to their natural ordering.
        return sorted(pairs, key=lambda item: item[0])


def Deeper_WEB(budget, api_msg, original_csv, local_match, hidden_match):
    """Run a smart crawl and a naive crawl for the local table; collect results.

    Args:
        budget: passed through to SmartCrawl and NaiveCrawl.
        api_msg: api_msg[0] names the API ('dblp Publ API',
            'aminer Publ API', 'yelp Search API' or 'google Place API');
            api_msg[1] is used as the location for the yelp/google APIs and
            in the yelp sample file name.
        original_csv: list of rows; row 0 is treated as the header row.
            Mutated in place: cells containing '</span>' are replaced by
            parsed text, and an 'AutoID' column is appended when the hidden
            id field is absent from hidden_match.
        local_match: local column names, parallel to hidden_match.
        hidden_match: hidden field names, parallel to local_match.

    Returns:
        dict with 'smart_queries', 'record', 'local_header', optionally
        'hidden_header', plus 'naive' and 'naive_queries'.

    Raises:
        ValueError: if api_msg[0] names no supported API (previously this
            surfaced later as an UnboundLocalError), or if a required hidden
            field is missing from hidden_match.
    """
    # Validate first so an unsupported API fails before any input mutation.
    label, fields = _detect_api(api_msg)
    id_field, _, match_fields = fields

    typo_ids = _strip_typo_markup(original_csv)

    api = _build_api(label, api_msg, naive=False)
    sampledata = _build_sampledata(label, api_msg)
    hiddendata = HiddenData(uniqueid=id_field, matchlist=list(match_fields))
    localdata = _build_localdata(original_csv, local_match, hidden_match,
                                 fields)

    SmartCrawl(budget, api, sampledata, localdata, hiddendata)
    localdata_csv = localdata.getRawData()
    crawldata_csv = Json2csv(hiddendata.getMergeResult()).getCsvdata()

    result = {}
    result['smart_queries'] = hiddendata.getQueryList()
    result['record'] = []
    if 'header' in crawldata_csv:
        result['local_header'] = localdata_csv['header']
        result['hidden_header'] = crawldata_csv['header']
        for local_id, hidden_ids in _sorted_match_pairs(hiddendata):
            # Each record is the local row followed by its matched hidden rows.
            record = [localdata_csv[local_id]]
            record.extend(crawldata_csv[hidden_id]
                          for hidden_id in hidden_ids)
            result['record'].append(record)
    else:
        result['local_header'] = localdata_csv['header']

    # Second crawl with a fresh HiddenData and the naive API settings.
    api = _build_api(label, api_msg, naive=True)
    hiddendata = HiddenData(uniqueid=id_field, matchlist=list(match_fields))
    result['naive'] = NaiveCrawl(budget, api, localdata, hiddendata, typo_ids)
    result['naive_queries'] = hiddendata.getQueryList()
    return result
format The result format of the search. Recognized values are "xml", xml ...?q=test&format=json "json", and "jsonp". h Maximum number of search results (hits) to return. For bandwidth 30 ...?q=test&h=100 reasons, this number is capped at 1000. f The first hit in the numbered sequence of search results (starting with 0) to return. In combination with the h parameter, 0 ...?q=test&h=100&f=300 this parameter can be used for pagination of search results. c Maximum number of completion terms (see below) to return. For 10 ...?q=test&c=0 bandwidth reasons, this number is capped at 1000. """ search_term = 'q' parameters = {'h': 1000} dblp = PublApi(delay=5, search_term=search_term, **parameters) """ \dblp_sample.pkl dblp_10000.pkl dblp_result\\result_file.pkl result_file.csv match_file.pkl match_file.csv """ sample_file = 'dblp_sample.pkl' localdata_file = 'dblp_10000.pkl' result_dir = 'dblp_result' sampledata = SampleData(samplepath=sample_file, filetype='pkl', uniqueid="row['key']", querylist=["row['title']"]) localdata = LocalData(localpath=localdata_file, filetype='pkl', uniqueid="row['key']", querylist=["row['title']"], matchlist=["row['title']"]) hiddendata = HiddenData(result_dir=result_dir, uniqueid="row['info']['key']", matchlist=["row['info']['title']"]) smartcrawl.smartCrawl(top_k, count, pool_thre, jaccard_thre, threads, budget, dblp, sampledata, localdata, hiddendata)
delay=5, search_term=search_term, **parameters) """ \yelp_sample.pkl yelp_10000.pkl yelp_result\\result_file.pkl result_file.csv match_file.pkl match_file.csv """ sample_file = 'yelp_sample.pkl' localdata_file = 'yelp_3000.pkl' result_dir = 'yelp_result' sampledata = SampleData(samplepath=sample_file, filetype='pkl', uniqueid="row['id']", querylist=["row['name']"]) """ localdata = LocalData(localpath=localdata_file, filetype='pkl', uniqueid="row['business_id']", querylist=["row['name']"], matchlist=["row['name']", "row['full_address']"]) hiddendata = HiddenData(result_dir=result_dir, uniqueid="row['id']", matchlist=["row['name']", "' '.join(row['location']['display_address'])"]) """ localdata = LocalData(localpath='yelp_5882.csv', filetype='csv', uniqueid='ID', querylist=['NAME'], matchlist=['NAME', 'ADDRESS']) hiddendata = HiddenData( result_dir=result_dir,
def setUp(self):
    """Build a SampleData fixture from example/yelp_sample.pkl."""
    # os.path.join keeps the path relative when dirname(__file__) is empty;
    # the previous '' + os.path.sep + '../../' form resolved to the
    # filesystem root in that case.
    base_dir = os.path.abspath(
        os.path.join(os.path.dirname(__file__), os.pardir, os.pardir))
    sample_file = os.path.join(base_dir, 'example', 'yelp_sample.pkl')
    self.sampledata = SampleData(sample_file, 'pkl', "row['id']",
                                 ["row['name']"])