Example #1
0
class AggregationTestCase(unittest.TestCase):
    """Smoke tests that run the aggregation estimators against the dblp API.

    Neither test inspects the estimator output: a test passes when the
    estimator call returns without raising.
    """

    def setUp(self):
        """Create the dblp client and the query pool used by every test."""
        self.dblp = PublApi(top_k=1000, delay=5, search_term='q', h=1000)

        # The sample CSV is located two directories above this test module.
        project_root = os.path.abspath(
            os.path.dirname(__file__) + os.path.sep + "../../")
        csv_path = project_root + '/csv_example/dblp_sample.csv'

        sample = LocalData(csv_path, 'csv', "key", ["title"], ["title"])
        # getlocalData() yields three values; only the second one feeds
        # the query generator.
        _, query_records, _ = sample.getlocalData()
        self.initQueries = utils.queryGene(query_records, 2)

    def test_stra_stratified_estimator(self):
        """Run the stratified estimator over the generated query pool."""
        estimator_args = {
            'query_pool': self.initQueries,
            'api': self.dblp,
            'match_term': ["info.title"],
            'candidate_rate': 0.2,
            'query_num': 100,
        }
        aggregation.stratified_estimator(**estimator_args)

    def test_sota_estimator(self):
        """Run the sota estimator with one query, then close the session."""
        estimator_args = {
            'query_pool': self.initQueries,
            'api': self.dblp,
            'match_term': ["info.title"],
            'uniqueid': "info.key",
            'query_num': 1,
        }
        aggregation.sota_estimator(**estimator_args)
        session = self.dblp.getSession()
        session.close()
Example #2
0
 def setUp(self):
     """Create the dblp client and a query pool built from the pickle sample."""
     self.dblp = PublApi(delay=5, search_term='q', h=1000)

     # The pickle lives under example/, two directories above this module.
     base_dir = os.path.abspath(
         os.path.dirname(__file__) + os.path.sep + "../../")
     pickle_path = base_dir + '/example/dblp_10000.pkl'

     key_expr = "row['key']"
     title_expr = "row['title']"
     records = LocalData(pickle_path, 'pkl', key_expr, [title_expr],
                         [title_expr])
     # Only the second of the three returned values is needed here.
     _, query_records, _ = records.getlocalData()
     self.initQueries = utils.queryGene(query_records, 2)
Example #3
0
 def setUp(self):
     """Create the dblp client and a query pool built from the CSV sample."""
     self.dblp = PublApi(top_k=1000, delay=5, search_term='q', h=1000)

     # The CSV lives under csv_example/, two directories above this module.
     base_dir = os.path.abspath(
         os.path.dirname(__file__) + os.path.sep + "../../")
     csv_path = base_dir + '/csv_example/dblp_sample.csv'

     records = LocalData(csv_path, 'csv', "key", ["title"], ["title"])
     # Only the second of the three returned values is needed here.
     _, query_records, _ = records.getlocalData()
     self.initQueries = utils.queryGene(query_records, 2)
Example #4
0
class DblpPublapiTestCase(unittest.TestCase):
    """Tests for PublApi that check how many hits the dblp API returns."""

    def setUp(self):
        """Create a client that asks for up to 1000 hits per call."""
        self.dblp = PublApi(top_k=1000, delay=5, search_term='q', h=1000)

    def tearDown(self):
        """Drop the reference to the client created in setUp."""
        self.dblp = None

    def test_callApi(self):
        """A single two-keyword query must return at least 900 hits."""
        keywords = ['set', 'cover']
        request_params = self.dblp.getKwargs()
        # Keywords are joined with '+' and stored under the search-term key.
        joined_query = '+'.join(keywords)
        request_params[self.dblp.getSearchTerm()] = joined_query
        hits = self.dblp.callAPI(params=request_params)
        assert 900 <= len(hits)

    def test_callMulApi(self):
        """Two queries issued together must return at least 1200 hits."""
        query_batch = [['set', 'cover'], ['approximate', 'query']]
        hits = self.dblp.callMulAPI(query_batch)
        # The session is closed before asserting, so it is released even
        # when the assertion below fails.
        self.dblp.getSession().close()
        assert 1200 <= len(hits)
Example #5
0
Parameter  Description                                                       Default  Example
q          The query string to search for, as described on a separate page.           ...?q=test+search
format     The result format of the search. Recognized values are "xml",     xml      ...?q=test&format=json
           "json", and "jsonp".
h          Maximum number of search results (hits) to return. For bandwidth  30       ...?q=test&h=100
           reasons, this number is capped at 1000.
f          The first hit in the numbered sequence of search results          0        ...?q=test&h=100&f=300
           (starting with 0) to return. In combination with the h parameter,
           this parameter can be used for pagination of search results.
c          Maximum number of completion terms (see below) to return. For     10       ...?q=test&c=0
           bandwidth reasons, this number is capped at 1000.
"""
# Configure the dblp client. 'q' is the query-string parameter and 'h' the
# maximum number of hits per request, per the parameter table above; 1000 is
# the cap that table documents for 'h'.
search_term = 'q'
parameters = {'h': 1000}
dblp = PublApi(top_k=1000, delay=5, search_term=search_term, **parameters)
"""
\\dblp_sample.csv
 dblp_3881.csv
 dblp_result\\result_file.pkl
              result_file.csv
              match_file.pkl
              match_file.csv
              enriched_file.pkl
              enriched_file.csv
"""
# File and directory names for this example; they match the layout sketched
# in the string above (sample CSV, local CSV and the result directory).
sample_file = 'dblp_sample.csv'
localdata_file = 'dblp_3881.csv'
result_dir = 'dblp_result'
sampledata = SampleData(sample_ratio=0.5,
                        samplepath=sample_file,
Example #6
0
from deeperlib.api.dblp.publapi import PublApi
from deeperlib.core import utils
from deeperlib.data_processing.local_data import LocalData
from deeperlib.estimator import aggregation

# ==== Sota-Estimator Dblp ====
# Build the dblp client ('q' is the search parameter, 'h' the hits per call)
# and derive the initial query pool from the local pickle data.
search_term = 'q'
parameters = {'h': 1000}
dblp = PublApi(delay=5, search_term=search_term, **parameters)
# NOTE(review): other examples load 'dblp_10000.pkl'; confirm that LocalData
# accepts this extension-less name for filetype 'pkl'.
localdata_file = 'dblp_10000'
localdata = LocalData(localdata_file, 'pkl', "row['key']", ["row['title']"],
                      ["row['title']"])
# Only the second of the three returned values is used below.
localdata_ids, localdata_query, localdata_er = localdata.getlocalData()
initQueries = utils.queryGene(localdata_query, 2)
aggregation.sota_estimator(query_pool=initQueries,
                           api=dblp,
                           match_term=["row['info']['title']"],
                           uniqueid="row['info']['key']",
                           query_num=1)
# Close this client's session before ``dblp`` is rebound below; otherwise the
# first session is never closed (only the second client was being closed).
dblp.getSession().close()

# ==== Stratified-Estimator Dblp ====
# A fresh client is created for the second estimator run.
dblp = PublApi(delay=5, search_term=search_term, **parameters)
aggregation.stratified_estimator(query_pool=initQueries,
                                 api=dblp,
                                 match_term=["row['info']['title']"],
                                 candidate_rate=0.2,
                                 query_num=100)
dblp.getSession().close()
Example #7
0
# Backends recognised in ``api_msg[0]``, in the order they are tested.
_DEEPER_BACKENDS = ('dblp Publ API', 'aminer Publ API', 'yelp Search API',
                    'google Place API')

# Per backend: (hidden unique-id field, hidden field used to build queries,
# hidden fields used for matching).
_DEEPER_HIDDEN_FIELDS = {
    'dblp Publ API': ('info.key', 'info.title', ('info.title',)),
    'aminer Publ API': ('id', 'title', ('title',)),
    'yelp Search API': ('id', 'name',
                        ('name', 'location.display_address.*')),
    'google Place API': ('place_id', 'name', ('name', 'formatted_address')),
}

# NOTE(security): these credentials are hard-coded in source control. They are
# kept byte-for-byte so behaviour is unchanged, but they should be moved to
# configuration / environment variables and rotated.
_YELP_CLIENT_ID = "QhqrWe9agsd0Ad6Gs0qgMQ"
_YELP_CLIENT_SECRET = "6WQWRMV8edOhaThyWgm96wAJkIzJ1pHOhm5N0AD20edrnzv0lwi3wfgZAFp0IqQ6WIc-pZki83kjpViwptlcsiV0-Ij3HI6AJxhOTE4jsjNOoZOHZI3823twg8yZWXYx"
_GOOGLE_PLACES_KEY = 'AIzaSyDhBJSPqHfcEkPGQGbH7l3eWyF_PhF10iw'


def _detect_backend(api_msg):
    """Return the first supported backend name contained in ``api_msg[0]``."""
    for backend in _DEEPER_BACKENDS:
        if backend in api_msg[0]:
            return backend
    raise ValueError('Unsupported API %r; expected one of: %s' %
                     (api_msg[0], ', '.join(_DEEPER_BACKENDS)))


def _build_api(backend, api_msg, naive):
    """Create the API client for ``backend`` (smart or naive crawl settings)."""
    if backend == 'dblp Publ API':
        hits = 1 if naive else 1000
        return PublApi(top_k=hits, delay=5, search_term='q', h=hits)
    if backend == 'aminer Publ API':
        return AdvancedPublApi(top_k=1 if naive else 300,
                               delay=5,
                               search_term='term',
                               size=1 if naive else 100,
                               sort='relevance')
    if backend == 'yelp Search API':
        # NOTE(review): the naive crawl always searches location 'AZ' and
        # ignores api_msg[1]; preserved as found -- confirm it is intended.
        return SearchApi(client_id=_YELP_CLIENT_ID,
                         client_secret=_YELP_CLIENT_SECRET,
                         top_k=1 if naive else 300,
                         delay=5,
                         search_term='term',
                         limit=1 if naive else 50,
                         location='AZ' if naive else api_msg[1])
    # 'google Place API'.
    # NOTE(review): unlike the other backends, the naive crawl keeps top_k=60
    # here; preserved as found -- confirm it is intended.
    return TextSearchApi(location='in+' + api_msg[1],
                         top_k=60,
                         delay=5,
                         search_term='query',
                         key=_GOOGLE_PLACES_KEY)


def _build_hiddendata(backend):
    """Create a fresh HiddenData describing the fields of ``backend``."""
    hidden_uid, _, match_fields = _DEEPER_HIDDEN_FIELDS[backend]
    return HiddenData(uniqueid=hidden_uid, matchlist=list(match_fields))


def _build_sampledata(backend, api_msg):
    """Create the SampleData for ``backend`` from the files under netdisk/."""
    if backend in ('dblp Publ API', 'aminer Publ API'):
        # Both publication backends share the dblp sample file.
        return SampleData(sample_ratio=0.5,
                          samplepath=settings.BASE_DIR +
                          '/netdisk/dblp_sample.csv',
                          filetype='csv',
                          uniqueid="key",
                          querylist=["title"])
    # Both place backends share the yelp sample for the requested location.
    return SampleData(sample_ratio=0.5,
                      samplepath="%s/netdisk/yelp_sample_%s.pkl" %
                      (settings.BASE_DIR, api_msg[1]),
                      filetype='pkl',
                      uniqueid="business_id",
                      querylist=["name"])


def _build_localdata(backend, original_csv, local_match, hidden_match):
    """Wrap ``original_csv`` in a LocalData using the user's column mapping.

    When the backend's unique-id field is not mapped, an 'AutoID' column
    holding each row's index is appended to ``original_csv`` in place.
    """
    hidden_uid, query_field, match_fields = _DEEPER_HIDDEN_FIELDS[backend]

    def local_column(hidden_field):
        # local_match and hidden_match are indexed in parallel.
        return local_match[hidden_match.index(hidden_field)]

    if hidden_uid in hidden_match:
        uniqueid = local_column(hidden_uid)
    else:
        uniqueid = 'AutoID'
        original_csv[0].append(uniqueid)
        for i in range(1, len(original_csv)):
            original_csv[i].append(i)
    return LocalData(uniqueid=uniqueid,
                     querylist=[local_column(query_field)],
                     matchlist=[local_column(field) for field in match_fields],
                     data_raw=original_csv)


def _sorted_match_pairs(match_pair):
    """Return the (local_id, hidden_ids) pairs sorted by local id.

    Ids are compared numerically when every id converts with ``int``;
    otherwise they are compared as-is.
    """
    pairs = match_pair.items()
    try:
        return sorted(pairs, key=lambda item: int(item[0]))
    except ValueError:
        return sorted(pairs, key=lambda item: item[0])


def Deeper_WEB(budget, api_msg, original_csv, local_match, hidden_match):
    """Run a smart crawl and a naive crawl for uploaded data and report both.

    Args:
        budget: passed unchanged to ``SmartCrawl`` and ``NaiveCrawl``.
        api_msg: sequence whose first item names the backend (it must contain
            one of 'dblp Publ API', 'aminer Publ API', 'yelp Search API' or
            'google Place API'); the second item is the location used by the
            yelp and google backends.
        original_csv: list of rows. Row 0 is treated as the header and is
            skipped by the per-row loops. Rows are modified in place: cells
            containing '</span>' are replaced with the parser's text, and an
            'AutoID' column is appended when no unique id is mapped.
        local_match: local column names, parallel to ``hidden_match``.
        hidden_match: hidden (API) field names, parallel to ``local_match``.

    Returns:
        dict with 'smart_queries', 'record', 'local_header', 'naive' and
        'naive_queries', plus 'hidden_header' when the crawl data has a
        'header' entry.

    Raises:
        ValueError: if ``api_msg[0]`` names no supported backend (raised
            before ``original_csv`` is touched), or if a field the backend
            needs is missing from ``hidden_match``.
    """
    # Validate first so an unsupported API fails with a clear message instead
    # of an unbound-variable error after the caller's rows were mutated.
    backend = _detect_backend(api_msg)

    # Strip markup from the first marked-up cell of each data row and record
    # that row's first column as a typo id.
    typo_ids = []
    parser = Deeper_HTMLParser()
    for i in range(1, len(original_csv)):
        row = original_csv[i]
        for j in range(1, len(row)):
            if '</span>' in row[j]:
                parser.feed(row[j])
                row[j] = parser.get_text()
                typo_ids.append(row[0])
                break

    # ---- Smart crawl ----
    api = _build_api(backend, api_msg, naive=False)
    sampledata = _build_sampledata(backend, api_msg)
    hiddendata = _build_hiddendata(backend)
    localdata = _build_localdata(backend, original_csv, local_match,
                                 hidden_match)

    SmartCrawl(budget, api, sampledata, localdata, hiddendata)
    localdata_csv = localdata.getRawData()
    crawldata_csv = Json2csv(hiddendata.getMergeResult()).getCsvdata()

    result = {}
    result['smart_queries'] = hiddendata.getQueryList()
    result['record'] = []
    has_crawl_data = 'header' in crawldata_csv
    result['local_header'] = localdata_csv['header']
    if has_crawl_data:
        result['hidden_header'] = crawldata_csv['header']
        # Each record is the local row followed by every matched hidden row.
        for local_id, hidden_ids in _sorted_match_pairs(
                hiddendata.getMatchPair()):
            record = [localdata_csv[local_id]]
            record.extend(crawldata_csv[hidden_id]
                          for hidden_id in hidden_ids)
            result['record'].append(record)

    # ---- Naive crawl: fresh client and HiddenData, same LocalData ----
    naive_api = _build_api(backend, api_msg, naive=True)
    naive_hidden = _build_hiddendata(backend)
    result['naive'] = NaiveCrawl(budget, naive_api, localdata, naive_hidden,
                                 typo_ids)
    result['naive_queries'] = naive_hidden.getQueryList()
    return result
Example #8
0
 def setUp(self):
     """Create a dblp client that asks for up to 1000 hits per call."""
     self.dblp = PublApi(top_k=1000, delay=5, search_term='q', h=1000)
Example #9
0
Parameter  Description                                                       Default  Example
q          The query string to search for, as described on a separate page.           ...?q=test+search
format     The result format of the search. Recognized values are "xml",     xml      ...?q=test&format=json
           "json", and "jsonp".
h          Maximum number of search results (hits) to return. For bandwidth  30       ...?q=test&h=100
           reasons, this number is capped at 1000.
f          The first hit in the numbered sequence of search results          0        ...?q=test&h=100&f=300
           (starting with 0) to return. In combination with the h parameter,
           this parameter can be used for pagination of search results.
c          Maximum number of completion terms (see below) to return. For     10       ...?q=test&c=0
           bandwidth reasons, this number is capped at 1000.
"""
# Configure the dblp client. 'q' is the query-string parameter and 'h' the
# maximum number of hits per request, per the parameter table above; 1000 is
# the cap that table documents for 'h'.
search_term = 'q'
parameters = {'h': 1000}
dblp = PublApi(delay=5, search_term=search_term, **parameters)

"""
\\dblp_sample.pkl
 dblp_10000.pkl
 dblp_result\\result_file.pkl
              result_file.csv
              match_file.pkl
              match_file.csv
"""
# File and directory names for this example; they match the layout sketched
# in the string above.
sample_file = 'dblp_sample.pkl'
localdata_file = 'dblp_10000.pkl'
result_dir = 'dblp_result'
# Both datasets are pickles whose fields are named with "row[...]" expression
# strings: 'key' is the unique id and 'title' is used for querying (and, for
# the local data, for matching as well).
# NOTE(review): presumably the library evaluates these strings per record --
# confirm against SampleData / LocalData.
sampledata = SampleData(samplepath=sample_file, filetype='pkl', uniqueid="row['key']", querylist=["row['title']"])
localdata = LocalData(localpath=localdata_file, filetype='pkl', uniqueid="row['key']", querylist=["row['title']"],
                      matchlist=["row['title']"])