Beispiel #1
0
def search_by_query(query, cluster=-1, result=20, limit=100000):
    """Return up to *result* sites from *cluster* that best match *query*.

    Every row yielded by ``site_info_by_cluster(cluster, limit=limit)`` is
    scored by ``norm(row[1] - embedding)``, where *embedding* is the averaged
    word embedding of *query*. The *result* lowest-scoring rows are kept and
    returned as ``{'url': ..., 'rank': ...}`` dicts sorted by ascending rank.
    A stored rank of -1 is replaced by ``DB_DEFAULT_RANK``.
    """
    query_embedding = average_word_embedding(query)

    scored = []
    for row in site_info_by_cluster(cluster, limit=limit):
        distance = norm(row[1] - query_embedding)
        url = row[0]
        rank = DB_DEFAULT_RANK if row[3] == -1 else row[3]
        scored.append((distance, url, rank))
    # Closest matches first; list.sort is stable, like sorted().
    scored.sort(key=lambda entry: entry[0])

    hits = [{'url': url, 'rank': rank} for _, url, rank in scored[:result]]
    # Among the closest matches, present the best-ranked sites first.
    hits.sort(key=lambda hit: hit['rank'])
    return hits
Beispiel #2
0
def get_similar_sites(embd, cluster=-1, top=10, improve=True, limit=11):
    """Return the *top* sites whose embedding is closest to *embd*.

    When a specific *cluster* is given (anything other than -1) and *improve*
    is true, the lookup covers the first *limit* entries of ``neigh[cluster]``
    in addition to *cluster* itself; otherwise only *cluster* is queried.

    Note: do not change the default value of *limit* unless necessary.
    """
    clusters = cluster
    if cluster != -1 and improve:
        # Neighbouring clusters first, the requested cluster last.
        clusters = [*neigh[cluster][:limit], cluster]

    distances = [
        (norm(row[1] - embd), row[0])
        for row in globaldata.site_info_by_cluster(clusters)
    ]
    distances.sort(key=lambda pair: pair[0])
    return [site for _, site in distances[:top]]
Beispiel #3
0
def search_by_domain(query, cluster=-1, results=50, limit=100000):
    """Return up to *results* sites from *cluster* whose domain matches *query*.

    *query* is iterated as a collection of words. A site matches when at
    least one word is a substring of the network location (``netloc``) of
    its URL. Each matching row is reported once, as a
    ``{'url': ..., 'rank': ...}`` dict, and the list is sorted by ascending
    rank before being truncated to *results* entries. A stored rank of -1 is
    replaced by ``DB_DEFAULT_RANK``.
    """
    domains = []
    for row in site_info_by_cluster(cluster, limit=limit):
        name = urlparse(row[0]).netloc
        # any() stops at the first hit, so a domain that matches several
        # query words is listed once instead of once per matching word.
        if any(word in name for word in query):
            domains.append({
                'url': row[0],
                'rank': row[3] if row[3] != -1 else DB_DEFAULT_RANK,
            })
    domains.sort(key=lambda x: x['rank'])
    return domains[:results]
Beispiel #4
0
def getClusterInfo(cluster_no):
    """Return the keywords and the most central site URLs of a cluster.

    *cluster_no* is the one-based cluster number used by the front end; it
    is converted to the zero-based number used by the database before any
    lookup.

    Returns a dict ``{'keywords': ..., 'urls': [...]}`` where ``urls`` holds
    the (at most) 10 sites closest to the cluster centroid, ordered by
    ascending rank. A stored rank of -1 is replaced by ``DB_DEFAULT_RANK``.

    If a ``sqlite3.Error`` is raised, the error is printed and the JSON
    string ``'[]'`` is returned instead of a dict.
    """
    try:
        # The front end numbers clusters 1-100, the database 0-99.
        cluster_no -= 1
        keywords = keywords_by_cluster(cluster_no)
        # cluster_no is already zero-based here, so it indexes the centroid
        # directly; subtracting 1 again would select the previous cluster's
        # centroid (and wrap to the last one for the first cluster).
        # NOTE(review): assumes kmeans labels match the database cluster
        # numbers - confirm.
        centroid = kmeans.cluster_centers_[cluster_no]
        sites = sorted([(norm(row[1] - centroid), row[0],
                         row[3] if row[3] != -1 else DB_DEFAULT_RANK)
                        for row in site_info_by_cluster(cluster_no)],
                       key=lambda x: x[0])
        # Keep the 10 sites nearest the centroid, then order them by rank.
        final = sorted([{
            'url': v[1],
            'rank': v[2]
        } for v in sites[:10]],
                       key=lambda x: x['rank'])
        only_urls = [site['url'] for site in final]
        return {'keywords': keywords, 'urls': only_urls}
    except sqlite3.Error as error:
        print('error fetching data from site_info', error)
        return json.dumps([])