from scipy import spatial
from sent2vec.vectorizer import Vectorizer
import snap


# df and df_label are globals prepared elsewhere; a stand-in is sketched after this example
def findDistribution(N=0.3, length=1000, sampled=400):
  sentences = []
  for s in df['Sentence']:
    sentences.append(s)
    if(len(sentences)==length):
      break

  print("[INFO] No of sentences= "+str(len(sentences)))
  vectorizer = Vectorizer()
  vectorizer.bert(sentences)
  vectors_bert = vectorizer.vectors
  data=[]

  for i in range(length):
    for j in range(i+1, length):
      dist = spatial.distance.cosine(vectors_bert[i], vectors_bert[j])
      data.append([i, j, dist])  # keep 0-based indices so they match the graph node ids added below
    if ((i + 1) * 10) % length == 0:
      print(str((i + 1) / length * 100) + " % done")
  data_sorted = sorted(data, key=lambda x: x[2], reverse=True)  # sort pairs by cosine distance, largest first

  G = snap.TUNGraph.New()
  for i in range(length):
    G.AddNode(i)

  val=int(length*N)
  for i in range (val):
    G.AddEdge(data_sorted[i][0],data_sorted[i][1]) 


  PRankH = G.GetPageRank()

  adj=dict()
  for i in G.Nodes():
    adj[i.GetId()]=[]

  for id in G.Nodes():
    i=id.GetId()
    for w in id.GetOutEdges():
      adj[i].append(w)

  pagerank=dict()
  for item in PRankH:
      pagerank[item]= PRankH[item]

  final=[]
  while(len(final)<sampled):
    # take the node with the highest remaining PageRank score,
    # then damp its neighbours so the sample stays spread out
    val = max(pagerank, key=pagerank.get)
    for u in adj[val]:
      if u in pagerank:
        pagerank[u]*=0.8
    pagerank.pop(val)
    final.append(val) 

  counts=dict()
  for i in range(7):
    counts[i]=0
  for i in final:
    counts[df_label.iloc[i,1]]+=1

  return counts
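Example #1 reads two globals that are not shown: df, with a 'Sentence' column, and df_label, whose second column holds an integer label in 0-6. A minimal, purely illustrative stand-in (every column name except 'Sentence' is an assumption) that makes the function callable:

import pandas as pd

# hypothetical stand-ins for the globals findDistribution expects
df = pd.DataFrame({"Sentence": ["toy sentence number %d" % k for k in range(20)]})
df_label = pd.DataFrame({"Id": range(20), "Label": [k % 7 for k in range(20)]})  # read via df_label.iloc[i, 1]

# counts = findDistribution(N=0.3, length=20, sampled=5)  # downloads a BERT model on first use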
Example #2
from sent2vec.vectorizer import Vectorizer
import pandas as pd
import time

# setup
sentence_data = pd.read_csv("./data/tasks/sentence_correction/task_data.csv")
whole_sentences = []

if __debug__:
    print(sentence_data.columns)
    start_time = time.time()

# each "row" contains its "values" as list item
# save corrected sentences to "whole_sentences"
for row, values in sentence_data.iterrows():
    whole_sentences.append(values[2].format(values[3].strip("{}")))

sentence_data["sentence_corpus"] = whole_sentences

# create vectorized items and save them as list
vectorizer = Vectorizer()
vectorizer.bert(sentence_data["sentence_corpus"])
sentence_data["sentence_vectors"] = vectorizer.vectors.tolist()

if __debug__:
    print(sentence_data.index)
    end_time = time.time() - start_time
    print(end_time)

sentence_data.to_pickle("pickled_sentences")
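A possible follow-up (not part of the original snippet): reload the pickle written above and compare two of the stored sentence vectors with scipy's cosine distance.

import pandas as pd
from scipy import spatial

stored = pd.read_pickle("pickled_sentences")
v0 = stored["sentence_vectors"].iloc[0]
v1 = stored["sentence_vectors"].iloc[1]
print(1 - spatial.distance.cosine(v0, v1))  # cosine similarity of the first two sentences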
Example #3
import numpy as np
from scipy import spatial
from sent2vec.vectorizer import Vectorizer


def chunky(d):
    # split the sentence into 4-word chunks, run pea_pick over each chunk,
    # and return the surviving phrases grouped into pairs
    # (group and pea_pick are defined elsewhere; see the sketch after this example)
    stat = group(d.split(' '), 4)
    f = []
    for s in stat:
        f.extend(pea_pick(s))
    f = [ff for ff in f if len(ff) > 0]
    pairs = group(f, 2)
    pairs = [[' '.join(pp) for pp in p] for p in pairs]
    return pairs


for sent in data:

    sentences = chunky(sent)

    dist = []
    for p in sentences:
        print(p)
        if len(p) > 1:
            vectorizer = Vectorizer()
            vectorizer.bert(p)
            vectors_bert = vectorizer.vectors

            dist.append(
                spatial.distance.cosine(vectors_bert[0], vectors_bert[1]))
            print(dist[-1])

    avg = int(np.average(dist) * 1000000)

    print("Sentence: " + sent)
    print(avg)
    print()
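Example #3 relies on two helpers, group and pea_pick, that are not shown. group evidently splits a token list into fixed-size chunks; pea_pick's logic is not visible, so the placeholder below simply passes its chunk through. A rough sketch only, to make the fragment runnable:

def group(items, n):
    # split a list into consecutive chunks of length n (the last chunk may be shorter)
    return [items[i:i + n] for i in range(0, len(items), n)]

def pea_pick(chunk):
    # placeholder: the original helper's behaviour is not shown; return the chunk as a single phrase
    return [chunk]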
Example #4
# Change this to 'for x in range(1):' to quickly test on the first query only
for x in range(49): #iterate queries again
    print("starting query...")
    '''
    docidarray = [] #one for each query
    for i in range (1000): #again the top 1000 results
        docid = results[i][0]
        docidarray.append(docid)
    '''
    
    #do the bert encoding
    #do the bert encoding (encodeBERT is sketched after this example) -> [['bert processed tweet', doc id],..]
    docVectors = encodeBERT(firstResultsList[x], Documents)

    #print (numpy.array(docVectors[1][0]))
    #encode query 
    vectorizer = Vectorizer()
    vectorizer.bert(queriesLst[x][1]) #current query
    queryVect = vectorizer.vectors
    newRank = []
    
    for i in range(len(docVectors)): # cosine distance between the query vector and each document vector

        dist = spatial.distance.cosine(queryVect[0], numpy.array(docVectors[i][0]))
        newRank.append([dist, docVectors[i][1]]) #-> appends [similarity distance, doc id]

    #3.rank the docs again based on scores (use sorted() function)
    sortedNewRank = sorted(newRank) 
    newDocRankingList.append(sortedNewRank)
    print(newDocRankingList)

    #4. write to results file
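encodeBERT is defined elsewhere in that project; the inline comment only says it returns [[vector, doc id], ...] for one query's top documents. A hedged sketch of what such a helper might look like, assuming firstResultsList[x] holds (doc_id, score) pairs and Documents maps doc ids to raw text:

from sent2vec.vectorizer import Vectorizer

def encodeBERT(results, documents):
    # results: (doc_id, score) pairs for one query; documents: doc_id -> raw text (both assumed)
    doc_ids = [doc_id for doc_id, _ in results]
    vectorizer = Vectorizer()
    vectorizer.bert([documents[doc_id] for doc_id in doc_ids])
    return [[vec, doc_id] for vec, doc_id in zip(vectorizer.vectors, doc_ids)]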
Example #5
def sent2vec_feature(utterances):
    vectorizer = Vectorizer()
    vectorizer.bert(utterances)
    return vectorizer.vectors
Example #6
    topic_info_dict: Dict[str, str] = file_operation.extract_file()

    result_dict = {}

    tokens_dict: Dict[str,
                      List[str]] = tokenizer.tokenize_bert(topic_info_dict)

    train_query, test_query = file_operation.extract_queries_for_bert()

    nltk.download('punkt')
    for topic_id in train_query:
        result_dict[topic_id] = {}
        train_query[topic_id] = nltk.tokenize.sent_tokenize(
            train_query[topic_id])

    vectorizer = Vectorizer()
    for topic_id in train_query:
        topic_id = str(topic_id)
        executor = ThreadPoolExecutor(len(tokens_dict))
        num_of_sentence_query = len(train_query[topic_id])
        i = 0
        ths = []

        temp_bm25_dict = bm25_dict[topic_id]
        sorted_bm25 = list(
            dict(
                sorted(temp_bm25_dict.items(),
                       key=lambda item: item[1],
                       reverse=True)).keys())[:100]

        for doc_id in sorted_bm25:
Example #7
import json
import os

from django.shortcuts import render
from scipy import spatial
from sent2vec.vectorizer import Vectorizer


# get_sentence_from_file is a project helper defined elsewhere
def add(request):
    if request.FILES:
        uploadedFile = request.FILES['fname']
        sepsent = get_sentence_from_file(uploadedFile)
        print(sepsent)
        i = 1
        Svectors = []  # list of all bert vectors
        centroids = {}  # centres of the clusters

        # for each cluster we defined separate distance and vector list
        Dist1 = []
        Dist2 = []
        Dist3 = []
        Dist4 = []
        Cluster1 = []
        Cluster2 = []
        Cluster3 = []
        Cluster4 = []

        # showing the progress information on the display
        i = 0

        # take each sentence from the list and compute its BERT vector representation
        for x in sepsent:
            progress_percent = round(((i * 100) / len(sepsent)), 2)
            # rough ETA assuming ~7 seconds per sentence
            remained_time_h = int(((7 * len(sepsent)) - (i * 7)) / 3600)
            remained_time_m = int((((7 * len(sepsent)) - (i * 7)) % 3600) / 60)
            print(' ----------------  progress :' + str(progress_percent) +
                  '% ---------remaining time(hh:mm): ' + str(remained_time_h) +
                  ':' + str(remained_time_m) + ' ------',
                  end='\r')
            i = i + 1

            vectorizer = Vectorizer()
            vectorizer.bert(x)
            vectors_bert = vectorizer.vectors
            Svectors.append(vectors_bert[0])
        # initialise 4 centroids for k-means: reuse saved centres if a JSON file exists, otherwise derive them from the vector sums
        if os.path.isfile('center_json_data.json'):
            pfc = open('center_json_data.json')
            jcenter = json.load(pfc)
            pfc.close()
            centroid1 = jcenter['center1']
            centroid2 = jcenter['center2']
            centroid3 = jcenter['center3']
            centroid4 = jcenter['center4']

        else:
            centroid1v = sum(Svectors) / len(Svectors)
            centroid2v = sum(Svectors) / (len(Svectors) / 2)
            centroid3v = sum(Svectors) / (len(Svectors) / 10)
            centroid4v = sum(Svectors) / (len(Svectors) / 28)
            centroid1 = centroid1v.tolist()
            centroid2 = centroid2v.tolist()
            centroid3 = centroid3v.tolist()
            centroid4 = centroid4v.tolist()

        print(centroid1)

        # creating json format for them to save them later

        lock1 = 0
        lock2 = 0
        lock3 = 0
        lock4 = 0

        loop_no = 0

        while True:
            print('---cluster:---')
            print(len(Cluster1))
            print(len(Cluster2))
            print(len(Cluster3))
            print(len(Cluster4))

            print('----------------')

            print('#######################')
            if len(Cluster1) > 0:
                if (centroid1 != (sum(Cluster1) / len(Cluster1))).any():
                    centroidiv1 = sum(Cluster1) / len(Cluster1)
                    centroid1 = centroidiv1.tolist()

                else:
                    lock1 = 1
            else:
                if loop_no > 100:
                    lock1 = 1
            if len(Cluster2) > 0:
                if (centroid2 != (sum(Cluster2) / len(Cluster2))).any():
                    centroidiv2 = sum(Cluster2) / len(Cluster2)
                    centroid2 = centroidiv2.tolist()

                else:
                    lock2 = 1
            else:
                if loop_no > 100:
                    lock2 = 1
            if len(Cluster3) > 0:
                if (centroid3 != (sum(Cluster3) / len(Cluster3))).any():
                    centroidiv3 = sum(Cluster3) / len(Cluster3)
                    centroid3 = centroidiv3.tolist()

                else:
                    lock3 = 1
            else:
                if loop_no > 100:
                    lock3 = 1
            if len(Cluster4) > 0:
                if (centroid4 != (sum(Cluster4) / len(Cluster4))).any():
                    centroidiv4 = sum(Cluster4) / len(Cluster4)
                    centroid4 = centroidiv4.tolist()

                else:
                    lock4 = 1
            else:
                if loop_no > 100:
                    lock4 = 1
            Dist1.clear()
            Cluster1.clear()
            Dist2.clear()
            Cluster2.clear()
            Dist3.clear()
            Cluster3.clear()
            Dist4.clear()
            Cluster4.clear()
            for x in Svectors:

                Tdist1 = spatial.distance.cosine(centroid1, x)
                Tdist2 = spatial.distance.cosine(centroid2, x)
                Tdist3 = spatial.distance.cosine(centroid3, x)
                Tdist4 = spatial.distance.cosine(centroid4, x)

                if Tdist1 == min([Tdist1, Tdist2, Tdist3, Tdist4]):
                    Dist1.append(Tdist1)
                    Cluster1.append(x)
                elif Tdist2 == min([Tdist1, Tdist2, Tdist3, Tdist4]):
                    Dist2.append(Tdist2)
                    Cluster2.append(x)
                elif Tdist3 == min([Tdist1, Tdist2, Tdist3, Tdist4]):
                    Dist3.append(Tdist3)
                    Cluster3.append(x)
                elif Tdist4 == min([Tdist1, Tdist2, Tdist3, Tdist4]):
                    Dist4.append(Tdist4)
                    Cluster4.append(x)
            print('---lock---')
            print(lock1)
            print(lock2)
            print(lock3)
            print(lock4)
            loop_no = loop_no + 1
            if lock1 == 1 and lock2 == 1 and lock3 == 1 and lock4 == 1:
                print('break')
                break

        json_center = {
            'center1': centroid1,
            'center2': centroid2,
            'center3': centroid3,
            'center4': centroid4,
        }

        with open('center_json_data.json', 'w') as fc:
            json.dump(json_center, fc)

        if os.path.isfile('meanDistance_json_data.json'):
            with open('meanDistance_json_data.json') as pfd:
                jdist = json.load(pfd)
            previous_dist1 = jdist['dist1']
            previous_dist2 = jdist['dist2']
            previous_dist3 = jdist['dist3']
            previous_dist4 = jdist['dist4']
            if previous_dist1 != 0:
                Dist1.append(previous_dist1)
            if previous_dist2 != 0:
                Dist2.append(previous_dist2)
            if previous_dist3 != 0:
                Dist3.append(previous_dist3)
            if previous_dist4 != 0:
                Dist4.append(previous_dist4)

        if len(Dist1) > 0:
            MeanDist1 = sum(Dist1) / len(Dist1)
        else:
            MeanDist1 = 0
        if len(Dist2) > 0:
            MeanDist2 = sum(Dist2) / len(Dist2)
        else:
            MeanDist2 = 0
        if len(Dist3) > 0:
            MeanDist3 = sum(Dist3) / len(Dist3)
        else:
            MeanDist3 = 0
        if len(Dist4) > 0:
            MeanDist4 = sum(Dist4) / len(Dist4)
        else:
            MeanDist4 = 0

        json_MeanDist = {
            'dist1': MeanDist1,
            'dist2': MeanDist2,
            'dist3': MeanDist3,
            'dist4': MeanDist4,
        }
        with open('meanDistance_json_data.json', 'w') as fd:
            json.dump(json_MeanDist, fd)

        context = {'center': json_center, 'dist': json_MeanDist}
    else:
        context = {'filename': '', 'dist': ''}
    return render(request, 'ndex.html', context)
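The view above hand-rolls k-means with four centroids and a manual convergence check; for comparison, the same clustering of the BERT vectors could be done with scikit-learn's KMeans (an alternative sketch, not what this snippet uses):

import numpy as np
from sklearn.cluster import KMeans

def cluster_sentence_vectors(svectors, k=4):
    # svectors: list of BERT sentence vectors, as built in the loop above
    km = KMeans(n_clusters=k, n_init=10, random_state=0).fit(np.array(svectors))
    return km.labels_, km.cluster_centers_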