Example #1
0
    def calculate_similarity(self, query_file, data_file, filename, k=None):
        """Score every query in query_file against the documents in data_file.

        Both files are loaded as doc.DocumentSet objects.  Each query is
        passed to self._tfidf together with the whole document set, and the
        scores of all queries are collected in query order and handed to
        output.write_output_file under the name 'filename'.

        The 'k' argument is accepted but not used by this method.
        """
        queries = doc.DocumentSet(query_file)
        corpus = doc.DocumentSet(data_file)

        # One flat list: the scores of query 1, then those of query 2, ...
        scores = [
            score
            for query in queries.documents
            for score in self._tfidf(query, corpus)
        ]

        output.write_output_file(filename, scores)
Example #2
0
  def calculate_similarity(self, query_file, data_file, filename, k=None):
    """Run self._tfidf for each query and write all scores to 'filename'.

    query_file and data_file are each loaded into a doc.DocumentSet.  The
    scores returned for the individual queries are concatenated in query
    order before being passed to output.write_output_file.

    The 'k' argument is accepted but not used by this method.
    """
    corpus_queries = doc.DocumentSet(query_file)
    corpus_documents = doc.DocumentSet(data_file)

    all_scores = []
    for current_query in corpus_queries.documents:
      # In-place concatenation keeps a single flat list of scores.
      all_scores += self._tfidf(current_query, corpus_documents)

    output.write_output_file(filename, all_scores)
Example #3
0
def main():
  """Compute word-overlap scores for the queries in qrys.txt against docs.txt.

  Every query is scored with _calculate_overlap against the inverted index of
  the document set, and the combined scores are written to 'overlap.top'.
  """
  queries = doc.DocumentSet('data/qrys.txt')
  corpus = doc.DocumentSet('data/docs.txt')

  # Flatten the per-query overlaps into one list, keeping query order.
  overlaps = [
      overlap
      for query in queries.documents
      for overlap in _calculate_overlap(query, corpus.inverted_index)
  ]

  output.write_output_file('overlap.top', overlaps)
Example #4
0
def start_program():
    """Run the full pipeline for one account typed in by the user.

    The typed value is stored in vk.params as 'user_id' when it parses as an
    integer and as 'screen_name' otherwise.  The user's info is then fetched,
    candidate users are searched and compared, the top-10 result is enriched
    with photos, and the final output is written both through db and o.
    """
    # Prompt text: "Enter the account id or nickname".
    account = input('Введите id аккаунта или ник ')
    try:
        vk.params['user_id'] = int(account)
    except ValueError:
        # Not a number, so treat the input as a nickname
        # (message: "a nickname was entered").
        print("введен ник")
        vk.params['screen_name'] = account

    user_info = vk.get_user_info()
    user_id = user_info['response'][0]['id']
    print(user_id)

    candidates = vk.search_users(user_info)
    comparison = vk.compare_friends_groups(candidates)
    top10 = s.find_top10(comparison)
    top10_with_photos = vk.find_top3_photos(top10)

    report = o.create_output_file(top10_with_photos)
    db.write_db_output(user_id, report)
    o.write_output_file(report)
    # Message: "Program finished".
    print("Программа завершена")
Example #5
0
  def calculate_similarity(self, query_file, data_file, filename):
    """Score queries against documents after expanding each query.

    For every query in query_file the method:
      1. scores all documents in data_file with self.tf_idf._tfidf;
      2. keeps the self.n_d highest-scoring documents;
      3. adds their word counts to the query's own word counts;
      4. scores every word of that combined document with
         self.tf_idf._document_tfidf and keeps the self.n_w highest;
      5. builds a new query that maps those words to their scores and
         scores the documents again with it.
    The second-pass scores of all queries are written to "filename".
    """
    queries = doc.DocumentSet(query_file)
    corpus = doc.DocumentSet(data_file)

    results = []
    for query in queries.documents:
      # First pass.  Sorting (-score, document) pairs puts the best score
      # first; pairs with equal scores are ordered by comparing the
      # documents themselves.
      ranked_documents = sorted(
          (-score, document)
          for (_, document, score) in self.tf_idf._tfidf(query, corpus))
      top_documents = [
          document for (_, document) in ranked_documents[:self.n_d]]

      # Pool the word counts of the query and of the selected documents
      # into a single combined document.
      pooled_counts = counter.Counter(query.words_counter)
      for document in top_documents:
        pooled_counts += document.words_counter
      pooled_document = doc.document_from_dict(None, dict(pooled_counts))

      # Rank the pooled words.  Words are visited in sorted order, and
      # equal scores are broken alphabetically by the tuple sort.
      ranked_words = sorted(
          (-self.tf_idf._document_tfidf(word, pooled_document, corpus), word)
          for word in sorted(pooled_document.words_counter))
      expanded_weights = {
          word: -negated_score
          for (negated_score, word) in ranked_words[:self.n_w]}

      # Second pass: the selected words, weighted by score, are the query.
      expanded_query = doc.document_from_dict(query.id, expanded_weights)
      results.extend(self.tf_idf._tfidf(expanded_query, corpus))

    output.write_output_file(filename, results)