Example #1
def get_dict_vectors_of_documents(documents, justTokens=False, scores=None, headers=None):
  # documents is a flat list of sentence pairs: pair i is (documents[2*i], documents[2*i + 1]).
  init_doc_count = len(documents) // 2
  operated_doc_count = 0
  doc_dict_vectors_list = []
  for i in range(len(documents) // 2):
    # Report progress every 400 pairs.
    if operated_doc_count == 400:
      init_doc_count -= 400
      operated_doc_count = 0
      print(str(init_doc_count) + " sets remaining")
    operated_doc_count += 1
    document1, document2 = documents[2 * i], documents[2 * i + 1]
    sent_1_tokens = process.tokens(document1)
    sent_2_tokens = process.tokens(document2)
    if justTokens:
      # Return only the token lists for each pair.
      doc_dict_vectors_list.append((sent_1_tokens, sent_2_tokens))
    else:
      dictionary_v = {}
      if headers is not None:
        # Attach the pre-computed per-pair scores under the given header names.
        val = {}
        for idx, header in enumerate(headers):
          val[header] = scores[i][idx]
        dictionary_v.update(val)
      dictionary_v.update(get_dict_vector_of_2_sentences(sent_1_tokens, sent_2_tokens))
      doc_dict_vectors_list.append(dictionary_v)
  return doc_dict_vectors_list
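A minimal usage sketch for the function above; the sentences, score matrix, and header names are made-up placeholders, and process.tokens / get_dict_vector_of_2_sentences are assumed to come from the surrounding module:

# Hypothetical input: a flat list where pair i is (docs[2*i], docs[2*i + 1]).
docs = ["A man is playing a guitar.", "A person plays the guitar.",
        "Two dogs run in a park.", "A cat sleeps on the sofa."]

# Tokens only:
token_pairs = get_dict_vectors_of_documents(docs, justTokens=True)

# Full feature dictionaries, with two pre-computed scores attached to each pair:
column_names = ["jaccard", "containment"]        # made-up header names
score_matrix = [[0.6, 0.7], [0.1, 0.2]]          # made-up scores, one row per pair
feature_dicts = get_dict_vectors_of_documents(docs, scores=score_matrix, headers=column_names)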
Example #2
def jaccard_and_containment_coefficient_evaluate(analyze_type, documents, answers, ngram_size=1, ngram_weighing=False, IDFScores=None, f=None):
  containment_coefficient_predicted_answers = []
  jaccard_coefficient_predicted_answers = []
  init_doc_count = len(documents) // 2
  operated_doc_count = 0
  for i in range(len(documents) // 2):
    # Report progress every 400 pairs.
    if operated_doc_count == 400:
      init_doc_count -= 400
      operated_doc_count = 0
      print(str(init_doc_count) + " sets remaining")
    operated_doc_count += 1
    document1, document2 = documents[2 * i], documents[2 * i + 1]
    sent_1_tokens = process.tokens(document1)
    sent_2_tokens = process.tokens(document2)
    # Compute both coefficients on POS-tag, lemma, or character n-grams.
    if analyze_type == "pos":
      jaccard_coefficient, containment_coefficient = ngram.POSTags_JaccardCoefficient_and_containment_coefficienct(sent_1_tokens, sent_2_tokens, ngram_size, ngram_weighing, IDFScores)
    elif analyze_type == "lemma":
      jaccard_coefficient, containment_coefficient = ngram.Lemma_JaccardCoefficient_and_containment_coefficienct(sent_1_tokens, sent_2_tokens, ngram_size, ngram_weighing, IDFScores)
    elif analyze_type == "character":
      jaccard_coefficient, containment_coefficient = ngram.character_ngram_JaccardCoefficient_and_containment_coefficienct(sent_1_tokens, sent_2_tokens, ngram_size, True, ngram_weighing, IDFScores)
    else:
      raise ValueError("unknown analyze_type: " + analyze_type)
    # Scale the [0, 1] coefficients to the 0-5 similarity range used by the gold answers.
    jaccard_coefficient_predicted_answers.append(5 * jaccard_coefficient)
    containment_coefficient_predicted_answers.append(5 * containment_coefficient)
  jaccard_error = "Error in Estimate For Jaccard Coefficient is " + str(utility.evaluate(answers, jaccard_coefficient_predicted_answers))
  containment_error = "Error in Estimate For Containment Coefficient is " + str(utility.evaluate(answers, containment_coefficient_predicted_answers))
  print(jaccard_error)
  print(containment_error)
  if f is not None:
    f.write(jaccard_error + "\n")
    f.write(containment_error + "\n")
  return jaccard_coefficient_predicted_answers, containment_coefficient_predicted_answers
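A sketch of how this evaluation helper might be driven; the sentence pairs and gold ratings are illustrative only, and ngram.* / utility.evaluate are assumed from the surrounding project:

docs = ["A man is playing a guitar.", "A person plays the guitar.",
        "Two dogs run in a park.", "A cat sleeps on the sofa."]
gold_scores = [4.2, 0.4]    # made-up gold similarity ratings, one per pair

with open("results.log", "w") as log_file:
  jaccard_preds, containment_preds = jaccard_and_containment_coefficient_evaluate(
      "lemma", docs, gold_scores, ngram_size=2, f=log_file)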
Example #3
def cosinesimilarity_without_TFIDF(document1, document2):
  # Raw term-frequency vectors (no IDF weighting).
  vector1 = Counter(process.tokens(document1))
  vector2 = Counter(process.tokens(document2))
  # Euclidean norms of the two vectors.
  len_vector_1 = math.sqrt(sum(v ** 2 for v in vector1.values()))
  len_vector_2 = math.sqrt(sum(v ** 2 for v in vector2.values()))
  # cosine = dot(v1, v2) / (|v1| * |v2|)
  cosine_similarity_score = utility.dict_dotprod(vector1, vector2) / float(len_vector_1 * len_vector_2)
  return cosine_similarity_score
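A quick illustrative call, assuming process.tokens performs ordinary word tokenisation; the sentences are placeholders:

score = cosinesimilarity_without_TFIDF("the cat sat on the mat", "the cat lay on the mat")
print(score)    # close to 1.0 for near-identical sentences, 0.0 when no tokens overlap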
Example #4
def cosinesimilarity(document1, document2, TFIDFScores):
  tokens1 = set(process.tokens(document1))
  tokens2 = set(process.tokens(document2))
  # Build TF-IDF-weighted vectors over each document's vocabulary.
  vector1 = DocvectorTFIDF(TFIDFScores, tokens1)
  vector2 = DocvectorTFIDF(TFIDFScores, tokens2)
  # Euclidean norms of the two vectors.
  len_vector_1 = math.sqrt(sum(v ** 2 for v in vector1.values()))
  len_vector_2 = math.sqrt(sum(v ** 2 for v in vector2.values()))
  # cosine = dot(v1, v2) / (|v1| * |v2|)
  cosine_similarity_score = utility.dict_dotprod(vector1, vector2) / float(len_vector_1 * len_vector_2)
  return cosine_similarity_score
Example #5
def CharacterIDFVector(documents, ngram_size=2):
  No_of_Documents = float(len(documents))
  IDFVector = Counter()
  for document in documents:
    tokens = process.tokens(document)
    # Count each character n-gram once per document (document frequency).
    IDFVector += Counter(set(utility.character_ngram_vector_keys(tokens, ngram_size)))
  # Convert document frequencies to smoothed IDF scores: log(N / (1 + df)).
  for key in IDFVector:
    IDFVector[key] = math.log(No_of_Documents / (1 + IDFVector[key]))
  return IDFVector
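A small sketch of building and inspecting the character-level IDF vector; the corpus is made up, and utility.character_ngram_vector_keys is assumed to return the character n-grams of a token list:

corpus = ["the cat sat", "the dog ran", "a cat and a dog"]
char_idf = CharacterIDFVector(corpus, ngram_size=2)
# With the smoothing log(N / (1 + df)), bigrams present in every document get the
# lowest (here negative) scores, while rare bigrams get the highest ones.
print(sorted(char_idf.items(), key=lambda kv: kv[1])[:5])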
Example #6
def TFIDF(documents):
  Vocabulary = Counter()
  DocVectors = []
  IDFVector = Counter()
  No_of_Documents = float(len(documents))
  n = len(documents)
  for document in documents:
    print(str(n) + " Documents remaining to process")
    n -= 1
    # Per-document term frequencies.
    tf_single_doc_count = Counter(process.tokens(document))
    Vocabulary += tf_single_doc_count
    DocVectors.append(tf_single_doc_count)
    # Count each term once per document (document frequency).
    IDFVector += Counter(tf_single_doc_count.keys())
  # Convert document frequencies to smoothed IDF scores: log(N / (1 + df)).
  for key in IDFVector:
    IDFVector[key] = math.log(No_of_Documents / (1 + IDFVector[key]))
  # Corpus-level TF-IDF score per term: total term frequency * IDF.
  TFIDFScores = defaultdict(lambda: 0.0, {key: Vocabulary[key] * IDFVector[key] for key in Vocabulary})
  return TFIDFScores, Vocabulary, DocVectors, IDFVector
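Putting Examples #4 and #6 together, a hypothetical end-to-end sketch: build corpus-level TF-IDF scores once, then score a document pair with the TF-IDF cosine similarity (the corpus is illustrative, and DocvectorTFIDF is assumed from the same module):

corpus = ["the cat sat on the mat", "the dog ran in the park", "a cat chased the dog"]
TFIDFScores, Vocabulary, DocVectors, IDFVector = TFIDF(corpus)
pair_score = cosinesimilarity(corpus[0], corpus[2], TFIDFScores)
print("cosine similarity:", pair_score)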