def kfcvkNN(mi, k=10):
	"""Run k-fold cross-validation of the TF-IDF kNN classifier and print overall accuracy."""
	correct = []
	tested = []
	tot = 0
	cor = 0
	for i in range(k):
		#mark_test_set(mi, k, i)
		MessageFeatures.test_fold = i
		MessageFeatures.folds = k
		
		tf = TFIDF(mi, 3)
		tf.train1()
		tf.correct = 0
		c = 0
		t = 0
		for m in mi:
			if m.isTest(mi.num_msgs):
				cl = tf.get_class_kNN(m)
				#print(cl)
				if cl == m.newsgroupnum:
					c+=1
				t+=1
		print(tf.correct)	
		correct.append(c)
		tested.append(t)
		tot+=t
		cor+=c
	print(1.0 * cor / tot)
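For reference, the per-fold counts collected above reduce to a single overall accuracy as sum(correct) / sum(tested). A minimal, self-contained sketch of that aggregation; the fold counts below are made-up placeholders, not output of kfcvkNN:

# Hypothetical per-fold results; each fold tests 20 messages.
correct = [18, 17, 19, 16]
tested = [20, 20, 20, 20]
overall_accuracy = 1.0 * sum(correct) / sum(tested)
print(overall_accuracy)  # 0.875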
Example #2
def pl_preprocessing(total_pl):
    train_data = []
    train_y = []
    NUM_PL = 8
    D_WORD = 300
    tf_idf = TFIDF(total_pl)
    pl_cnt, words = tf_idf.get_tfidf()

    # class label index for the current field
    l = 0

    for field in total_pl:
        #         print(field)
        for num, j in enumerate(field):

            m = get_pl_v(j, pl_cnt, NUM_PL, D_WORD)
            # keep only complete vectors (NUM_PL * D_WORD = 2400 entries)
            if len(m) == NUM_PL * D_WORD:
                train_data.append(m)
                train_y.append(l)
        l += 1
#                             print(i)
#     print(t,s)

    return train_data, train_y
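The length check above (2400) matches NUM_PL * D_WORD = 8 * 300, i.e. a feature vector built from NUM_PL word vectors of dimension D_WORD. A small numpy sketch of that assumption; get_pl_v itself is not shown here, so the zero matrix is only a stand-in:

import numpy as np

NUM_PL = 8    # number of slots, as in pl_preprocessing
D_WORD = 300  # word-vector dimensionality

vectors = np.zeros((NUM_PL, D_WORD))  # stand-in for the vectors get_pl_v would produce
m = vectors.flatten()
assert len(m) == NUM_PL * D_WORD == 2400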
Example #3
def testBaseFC(seedFile, pLimit):
    print 'Content-Type: text/plain\n\n'
    mytfidf = TFIDF()
    myEventScorer = EventScorer.EventScorer()

    docs = downloadRawDocs(seedFile)
    seedURLs = getSeedURLs(seedFile)
    print seedURLs
    pagesLimit = pLimit
    pageScoreThreshold = 0.5
    urlScoreThreshold = 0.4
    options = {
        "num_pages": pagesLimit,
        "pageScoreThreshold": pageScoreThreshold,
        "urlScoreThreshold": urlScoreThreshold,
        "seeds": seedURLs
    }
    #print urls_tokens
    #print title_tokens

    cleandocs = getTokenizedDocs(docs)

    pos = cleandocs

    #print len(pos)
    #print len(neg)
    #print pos
    mytfidf.buildModel(pos)
    #mytfidf.buildModel(cleandocs)
    #mytfidf.buildModel(cleandocs,urls_tokens,title_tokens)
    #eventFC(myEventScorer, mytfidf, options)
    baseFC(mytfidf, options)
Example #4
    def get_keywords(self, pageText, count):
        mytfidf = TFIDF()
        tokenPageText = getTokenizedDocs([pageText])
        token_bow = [mytfidf.doc2bow(doc) for doc in tokenPageText]
        mytfidf.buildVocabIndex(token_bow)
        selected = mytfidf.selectImportantWords_tf(count)
        wordsList = list(mytfidf.index.keys())  # list() so the index lookup below also works on Python 3
        selected_words = [wordsList[k[1]] for k in selected]
        return selected_words
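selectImportantWords_tf is not shown in this snippet; as a rough orientation only, term-frequency-based keyword selection can be sketched with collections.Counter (this is an assumption about the idea, not the class's actual implementation):

from collections import Counter

def keywords_by_tf(tokens, count):
    # Rank tokens by raw term frequency and keep the `count` most frequent.
    return [word for word, _ in Counter(tokens).most_common(count)]

print(keywords_by_tf(["storm", "storm", "rain", "wind", "rain", "storm"], 2))
# ['storm', 'rain']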
Example #5
    def __init__(self, file_name):
        """Creates a search engine backed by PageRank and TF-IDF

        Args:
            file_name: path to xml files of wiki dump
        """
        # build corpus from xml files
        self.corpus, self.links = build_corpus(file_name)
        self.tf_idf = TFIDF(self.corpus)
        print("TFIDF engine has started")
        self.reverse_index = {word: set(mapping.keys())
                              for word, mapping in self.tf_idf.tf_idf.items()}
        self.page_rank = PageRank(self.links, self.tf_idf.tf_idf)
        print("PageRank engine has started")
Example #6
def ask_question(qs_input, top_k):
    """
    Ask one question and generate response for tfidf, lm and cnn
    """

    print("Question : %s" % qs_input)
    print("Top k : : %d" % top_k)

    random.seed(12345)
    retrieval_data_start_time = time.clock()
    questions, pred_questions, answers, pred_answers = Data.read_pred_data(
        "Data/pred_QA-pair.csv")
    # Build word --> sentence dictionary
    word_sentence_dict = Data.generate_word_sentence_dict(pred_questions)

    print("Retrieval Data Finished")

    retrieval_data_end_time = time.clock()
    print("Retrieval Data cost %f" %
          (retrieval_data_end_time - retrieval_data_start_time))

    response_start_time = time.clock()

    lm = LM(questions, pred_questions, answers, pred_answers,
            word_sentence_dict)
    tfidf = TFIDF(questions, pred_questions, answers, pred_answers,
                  word_sentence_dict)
    cnn = CNN(questions,
              pred_questions,
              answers,
              pred_answers,
              word_sentence_dict,
              isTrain=False)

    _, lm_response = lm.ask_response(qs_input, top_k=top_k)
    tfidf_response_id, tfidf_response = tfidf.ask_response(qs_input,
                                                           top_k=top_k * 10)
    cnn_response = cnn.ask_response(qs_input, top_k, tfidf_response_id)

    for i in range(top_k):
        print("LM response %d: %s" % (i + 1, lm_response[i]))
    for i in range(top_k):
        print("TFIDF response %d: %s" % (i + 1, tfidf_response[i]))
    for i in range(top_k):
        print("CNN response %d: %s" % (i + 1, cnn_response[i]))

    print("Response Finished")

    response_end_time = time.clock()
    print("Response cost %f" % (response_end_time - response_start_time))
Example #7
def baseFC_OneTargetVector(crawlParams):
    seedURLs = crawlParams['seedURLs']
    # one initial priority-queue entry per seed URL
    t = [(-1, p, -1, "") for p in seedURLs]
    priorityQueue = PriorityQueue(t)

    crawlParams["priorityQueue"] = priorityQueue
    mytfidf = TFIDF()

    mytfidf.buildModel(crawlParams['model'], crawlParams['No_Keywords'])
    #mytfidf.buildModel(crawlParams['seedURLs'],crawlParams['No_Keywords'])
    crawlParams['scorer'] = mytfidf

    #crawler = Crawler(priorityQueue,scorer,options)
    crawler = Crawler(crawlParams)
    crawler.crawl()
    return crawler.relevantPages
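A hedged example of calling the function: the dictionary below only carries the keys read above ('seedURLs', 'model', 'No_Keywords'), and all values are placeholders rather than settings from the original project:

crawlParams = {
    "seedURLs": ["http://example.com/a", "http://example.com/b"],  # placeholder seeds
    "model": [["flood", "rescue", "evacuation"]],                  # placeholder tokenized model docs
    "No_Keywords": 10,                                             # placeholder keyword count
}
relevantPages = baseFC_OneTargetVector(crawlParams)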
Example #8
    def __init__(self, modelInstance):
        self.model = modelInstance
        features = [
            cosine_similarity.CosineSimilarity(),
            n_gram_matching.NGramMatching(),
            sentiment_feature.SentimentFeature(),
            SVD.SVD(),
            TFIDF.TFIDF(),
            baseline_features.BaselineFeature(),
            cue_words.CueWords()
        ]
        self.features_train = np.hstack(
            [feature.read() for feature in features])
        self.labels_train = DataSet(path="../FNC-1").get_labels()
        self.features_test = np.hstack(
            [feature.read('competition_test') for feature in features])
        self.labels_test = DataSet(path="../FNC-1",
                                   name="competition_test").get_labels()
Example #9
    def get_similarity(self, matrix=None, langue=None):

        if langue is None:
            self.set_sparse_matrix(matrix)

        else:
            tfidf = TFIDF(matrix, langue)
            sparse_matrix = tfidf.get_sparse_matrix()
            self.set_sparse_matrix(sparse_matrix)

        if self.dr is False:
            dimensionality_reduction = int(round((len(matrix[0]) * 3 / 4)))
            self.set_dimensionality_reduction(dimensionality_reduction)

        permutation_matrix = self._get_permutation_matrix()
        signature_matrix = self._get_signature_matrix(permutation_matrix)
        similarity_matrix = self._get_similarity_matrix(signature_matrix)
        return similarity_matrix
def get_dominating_words(context_dict, corpusdir):
    tfidf = TFIDF(corpusdir)
    dominating = init_dict()
    cache = dict()
    for t in context_dict.keys():
        contexts = context_dict[t]
        for c in contexts:
            curr_max = (None, -1)
            for tok in c:
                if tok == "-ENT-":
                    break
                if tok not in cache:
                    cache[tok] = tfidf.idf(tok)
                if cache[tok] > curr_max[1]:
                    curr_max = (tok, cache[tok])
            if curr_max[0] is not None:
                dominating[t].append(curr_max[0])
    return dominating
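The inner loop above keeps, per context, the token with the highest IDF seen before the -ENT- marker. A self-contained sketch of that selection with a toy IDF table (scores are illustrative only):

idf = {"the": 0.1, "hurricane": 3.2, "hit": 1.4}  # toy IDF scores

def dominating_token(context, idf_scores):
    best_tok, best_score = None, -1
    for tok in context:
        if tok == "-ENT-":  # stop at the entity marker, as in the loop above
            break
        score = idf_scores.get(tok, 0.0)
        if score > best_score:
            best_tok, best_score = tok, score
    return best_tok

print(dominating_token(["the", "hurricane", "hit", "-ENT-", "hard"], idf))  # hurricane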
Example #11
def main():
    ######### Input and Output #########
    lilypath = 'lily'  # IMPORTANT: set your own lily corpus path
    stopWordspath = 'Chinese-stop-words.txt'  # IMPORTANT: set your own stop-word list
    stopWords = codecs.open(stopWordspath, 'r', 'gbk')
    inputfile = {}
    outputfile = {}
    filenames = os.listdir(lilypath)
    cnt = 0
    for filename in filenames:
        inputfile[cnt] = codecs.open(lilypath + '/' + filename, 'r', 'utf-8')
        outputfile[cnt] = open(filename, 'w+')
        cnt += 1
    ############# TFIDF #############
    TFIDF(inputfile, outputfile, stopWords, cnt)  # run the TFIDF algorithm
    for i in range(0, cnt):
        inputfile[i].close()
        outputfile[i].close()
    stopWords.close()
Example #12
def testEventFC(seedUrls, pLimit, eventTree):
    #print 'GIVEN TREE:'
    #print eventTree
    # Write the seedUrls to a file
    seedFile = 'addurls.txt'
    if os.path.isfile(seedFile):
        os.remove(seedFile)
    f = os.open(seedFile, os.O_CREAT | os.O_RDWR)
    os.write(f, seedUrls)
    os.close(f)

    # Write the Event Tree to file
    eventFile = 'event-details.txt'
    if os.path.isfile(eventFile):
        os.remove(eventFile)
    fw = os.open(eventFile, os.O_CREAT | os.O_RDWR)
    os.write(fw, eventTree.lower())
    os.close(fw)

    mytfidf = TFIDF()  # appears to work fine (called then exited)

    myEventScorer = EventScorer.EventScorer()

    docs = downloadRawDocs(seedFile)
    seedURLs = getSeedURLs(seedFile)
    print seedURLs

    pagesLimit = pLimit
    pageScoreThreshold = 0.4
    urlScoreThreshold = 0.4
    # set threshold so scorer knows when to print tree to file
    myEventScorer.set_threshold(pageScoreThreshold)
    options = {
        "num_pages": pagesLimit,
        "pageScoreThreshold": pageScoreThreshold,
        "urlScoreThreshold": urlScoreThreshold,
        "seeds": seedURLs
    }
    cleandocs = getTokenizedDocs(docs)
    #print 'cleandocs'
    #print cleandocs
    mytfidf.buildModel(cleandocs)
    eventFC(myEventScorer, mytfidf, options)
def tfidf(mi):
	MessageFeatures.test_fold = -1
	tf = TFIDF(mi, 3)
	tf.train1()
	cj = 0
	cj_count = 0
	tf.correct = 0
	# classify up to 20 messages from each newsgroup
	for m in mi:
		if cj_count >= 20:
			cj_count = 0
			cj += 1
		elif m.newsgroupnum == cj:
			cj_count += 1
			c = tf.get_class_kNN(m)
			print(c)
	print(tf.correct)
def testEventFC(seedFile, pLimit):
    print 'Content-Type: text/plain\n\n'
    mytfidf = TFIDF()
    myEventScorer = EventScorer.EventScorer()
    docs = downloadRawDocs(seedFile)
    seedURLs = getSeedURLs(seedFile)
    print seedURLs
    pagesLimit = pLimit
    pageScoreThreshold = 0.5
    urlScoreThreshold = 0.4
    # set threshold so scorer knows when to print tree to file
    myEventScorer.set_threshold(pageScoreThreshold)
    options = {
        "num_pages": pagesLimit,
        "pageScoreThreshold": pageScoreThreshold,
        "urlScoreThreshold": urlScoreThreshold,
        "seeds": seedURLs
    }
    cleandocs = getTokenizedDocs(docs)
    mytfidf.buildModel(cleandocs)
    eventFC(myEventScorer, mytfidf, options)
Example #15
def testBaseFC(seedUrls, pLimit):
    mytfidf = TFIDF()
    myEventScorer = EventScorer.EventScorer()

    # Write the seedUrls to a file
    seedFile = 'addurls.txt'
    if os.path.isfile(seedFile):
        os.remove(seedFile)
    f = os.open(seedFile, os.O_CREAT | os.O_RDWR)
    os.write(f, seedUrls)
    os.close(f)

    docs = downloadRawDocs(seedFile)
    seedURLs = getSeedURLs(seedFile)
    print seedURLs
    pagesLimit = pLimit
    pageScoreThreshold = 0.4
    urlScoreThreshold = 0.4
    options = {
        "num_pages": pagesLimit,
        "pageScoreThreshold": pageScoreThreshold,
        "urlScoreThreshold": urlScoreThreshold,
        "seeds": seedURLs
    }
    #print urls_tokens
    #print title_tokens

    cleandocs = getTokenizedDocs(docs)

    pos = cleandocs

    #print len(pos)
    #print len(neg)
    #print pos
    mytfidf.buildModel(pos)
    #mytfidf.buildModel(cleandocs)
    #mytfidf.buildModel(cleandocs,urls_tokens,title_tokens)
    #eventFC(myEventScorer, mytfidf, options)
    baseFC(mytfidf, options)
def part1(documentPath, maximumDocuments=0):
  startTime = time.time()
  print("Executing code for Part 1...\n")

  print("Extracting data from XML Document...")
  values = XMLParse(documentPath, maximumDocuments)
  print("Number of Documents: "+str(len(values)))
  extractionTime = round(time.time() - startTime, 3)
  print("Time: " + str(extractionTime) + " seconds")

  print("Removing stopwords and stemming...")
  for i in range(len(values)-1, -1, -1):
    if values[i].hasField('BODY'):
      values[i].setField('BODY',removeStopwords(values[i].getField("BODY")))
    else:
      del values[i]
  removingTime = round(time.time() - startTime - extractionTime, 3)
  print("Time: " + str(removingTime) + " seconds")

  print("Creating list of all unique words in corpus...")
  uniqueWords = getUniqueWords(values)
  uniqueWordsTime = round(time.time() - startTime - extractionTime - removingTime, 3)
  print("Time: " + str(uniqueWordsTime) + " seconds")

  print("Computing TF, IDF, and TFIDF...")
  computedTFIDF = TFIDF(values, uniqueWords)
  idfTime = round(time.time() - startTime - extractionTime - removingTime - uniqueWordsTime, 3)
  print("Time: " + str(idfTime) + " seconds")

  print("Computing Cosine Similarity...")
  computedTFIDF.calculateCosineSimilarity()
  #computedTFIDF.printVal('sim', 19)
  cosineSimTime = round(time.time() - startTime - extractionTime - removingTime - uniqueWordsTime - idfTime, 3)
  print("Time: " + str(cosineSimTime) + " seconds")


  print('\nPart 1 Complete')
  print("Execution Time: " + str(round(time.time() - startTime, 3)) + " seconds\n")
  return computedTFIDF
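The stage timings above are computed by subtracting every previous stage from the running total. An equivalent, slightly simpler pattern keeps a moving checkpoint; this is only a sketch of an alternative, not a change to part1:

import time

def elapsed_since(checkpoint):
    # Seconds since the checkpoint (rounded to 3 decimals) plus a fresh checkpoint.
    now = time.time()
    return round(now - checkpoint, 3), now

checkpoint = time.time()
# ... extraction stage would run here ...
extractionTime, checkpoint = elapsed_since(checkpoint)
# ... stopword removal stage would run here ...
removingTime, checkpoint = elapsed_since(checkpoint)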
Example #17
def testEventFC(seedUrls, pLimit, eventTree):
    print 'Content-Type: text/plain\n\n'
    mytfidf = TFIDF()
    myEventScorer = EventScorer.EventScorer()

    # Write the seedUrls to a file
    seedFile = 'addurls.txt'
    if os.path.isfile(seedFile):
        os.remove(seedFile)
    f = open(seedFile, "w")
    f.write(seedUrls)
    f.close()

    # Write the Event Tree to file
    eventFile = 'event-details.txt'
    if os.path.isfile(eventFile):
        os.remove(eventFile)
    fw = open(eventFile, "w")
    fw.write(eventTree)
    fw.close()

    docs = downloadRawDocs(seedFile)
    seedURLs = getSeedURLs(seedFile)
    print seedURLs
    pagesLimit = pLimit
    pageScoreThreshold = 0.5
    urlScoreThreshold = 0.4
    # set threshold so scorer knows when to print tree to file
    myEventScorer.set_threshold(pageScoreThreshold)
    options = {
        "num_pages": pagesLimit,
        "pageScoreThreshold": pageScoreThreshold,
        "urlScoreThreshold": urlScoreThreshold,
        "seeds": seedURLs
    }
    cleandocs = getTokenizedDocs(docs)
    mytfidf.buildModel(cleandocs)
    eventFC(myEventScorer, mytfidf, options)
Example #18
def test():
    mytfidf = TFIDF()
    docs = downloadRawDocs("typhoon_haiyan_SEED_URLs.txt")
    seedURLs = getSeedURLs("typhoon_haiyan_SEED_URLs.txt")
    pagesLimit = 1000
    pageScoreThreshold = 0.5
    urlScoreThreshold = 0.4
    options = {"num_pages": pagesLimit,"pageScoreThreshold":pageScoreThreshold,"urlScoreThreshold":urlScoreThreshold , "seeds":seedURLs}
    #print urls_tokens
    #print title_tokens    
    
    cleandocs = getTokenizedDocs(docs)
    
    pos = cleandocs
    
    #print len(pos)
    #print len(neg)
    #print pos
    mytfidf.buildModel(pos)
    #mytfidf.buildModel(cleandocs)
    #mytfidf.buildModel(cleandocs,urls_tokens,title_tokens)
    
    baseFC(mytfidf, options)
Example #19
def get_Dnn_model(total_pl):

    NUM_PL = 8
    D_WORD = 300
    tf_idf = TFIDF(total_pl)
    pl_cnt, words = tf_idf.get_tfidf()

    x, y = pl_preprocessing(total_pl, NUM_PL)
    x = np.array(x)
    y = np.array(y)

    ### basic settings (moved up so FOLDS is defined before it is used below)
    FOLDS = 5
    in_units = D_WORD * NUM_PL
    n_class = 6  # the task only asks for classes 0, 1, 2, 3 and 4 (5 classes)

    ###### test set drawn from the same data as training ######
    X_train, X_test1, Y_train, y_test1 = train_test_split(x, y, test_size=0.2)
    data = CrossValidationFolds(X_train, Y_train, FOLDS)
    (X_train1, y_train1), (X_valid1, y_valid1) = data.split()

    ###### test set drawn from different data than training ######
    # data = CrossValidationFolds(x, y, FOLDS)
    # (X_train1, y_train1), (X_valid1, y_valid1) = data.split()

    # X_test1,y_test1 = load_pl('../new_Steeve_data/filter_Dice/can/')
    # X_test1 = np.array(X_test1)
    # y_test1 = np.array(y_test1)

    ##### testing data ######
    # Tx = X_test1[0]
    # Ty = y_test1[0]
    # Tx = Tx.reshape([1,-1])
    # print(Tx.shape)
    # print(X_test1.shape)

    n_train = len(X_train1)  # number of training examples
    batch_size = 50
    n_batch = n_train // batch_size

    X = tf.placeholder(tf.float32, [None, in_units],
                       name="X")  # input placeholder of shape [None, in_units]
    y = tf.placeholder(tf.int64, shape=(None), name="y")  # label placeholder of shape [None]

    logits = L_layers_model(X, 128, n_class, 0.5)
    Y_proba = tf.nn.softmax(logits, name="Y_proba")
    loss, training_op = train_op(y, logits)  # avoid rebinding the train_op function name
    accuracy, precision, recall = acc_model(y, logits)

    prediction = tf.argmax(Y_proba, 1)

    saver = tf.train.Saver()  # saver for model checkpoints
    config = tf.ConfigProto(device_count={'GPU': 1})  # limit to one GPU

    # Params for Train
    epochs = 1000  # 10 for augmented training data, 20 for training data
    val_step = 100  # evaluate validation accuracy every val_step steps

    # Training cycle
    max_acc = 0.  # maximum validation accuracy seen so far
    early_stop_limit = 0  # early-stopping counter

    init = tf.global_variables_initializer()
    init_l = tf.local_variables_initializer()

    with tf.Session(config=config) as sess:
        run(sess, X_train1, y_train1, X_valid1, y_valid1)
        sess.run(init_l)
        saver.restore(sess, '../dnn_model.ckpt')  # restore the model saved by early stopping

        print('Acc_test :',
              sess.run(accuracy, feed_dict={
                  X: X_test1,
                  y: y_test1
              }))
        print('Prec_value :',
              sess.run(precision, feed_dict={
                  X: X_test1,
                  y: y_test1
              }))
        print('Recall_value :',
              sess.run(recall, feed_dict={
                  X: X_test1,
                  y: y_test1
              }))
Example #20
def cnn_output(input_file_name, output_file_name, output_num, top_k):
    """
    Generate cnn outputs
    """
    random.seed(12345)
    retrieval_data_start_time = time.clock()
    questions, pred_questions, answers, pred_answers = Data.read_pred_data(
        input_file_name)
    # Build word --> sentence dictionary
    word_sentence_dict = Data.generate_word_sentence_dict(pred_questions)

    print("Retrieval Data Finished")

    retrieval_data_end_time = time.clock()
    print("Retrieval Data cost %f" %
          (retrieval_data_end_time - retrieval_data_start_time))

    cnn_response_start_time = time.clock()

    tfidf = TFIDF(questions, pred_questions, answers, pred_answers,
                  word_sentence_dict)
    cnn = CNN(questions,
              pred_questions,
              answers,
              pred_answers,
              word_sentence_dict,
              isTrain=False)

    if output_file_name.split(".")[-1] == "txt":
        output = open(output_file_name, "w")
        for i in range(output_num):
            qs_index = int(random.random() * len(questions))
            qs_input = questions[qs_index].encode("utf-8")
            output.write("Question : %s\n" % qs_input)
            tfidf_response_id, tfidf_response = tfidf.ask_response(
                qs_input, top_k * 10)
            cnn_response = cnn.ask_response(qs_input, top_k, tfidf_response_id)

            for j in range(top_k):
                output.write("CNN response %d: %s\n" %
                             (j + 1, cnn_response[j].encode("utf-8")))
            output.write("\n")
        output.close()
        cnn_response_end_time = time.clock()
        print("CNN response cost %f" %
              (cnn_response_end_time - cnn_response_start_time))

    if output_file_name.split(".")[-1] == "csv":
        with open(output_file_name, 'w') as csvfile:
            fieldnames = ['Question']
            fieldnames.extend(["Reply " + str(i + 1) for i in range(top_k)])
            fieldnames.append("Score")
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()

            for i in range(output_num):
                row = {"Score": ""}  # avoid shadowing the dict builtin
                qs_index = int(random.random() * len(questions))
                qs_input = questions[qs_index].encode("utf-8")
                row["Question"] = qs_input
                tfidf_response_id, tfidf_response = tfidf.ask_response(
                    qs_input, top_k * 10)
                cnn_response = cnn.ask_response(qs_input, top_k,
                                                tfidf_response_id)

                for j in range(min(top_k, len(cnn_response))):
                    row["Reply " + str(j + 1)] = cnn_response[j].encode("utf-8")
                writer.writerow(row)
Example #21
def main():

    command = sys.argv[1]
    no = int(sys.argv[2])
    if command == "CP":
        if no == 1:

            tensor = MovieTensor(1)
            tensor.getTensor()
        elif no == 2:
            tensor = MovieTensor(2)
            tensor.getTensor()
    elif command == "SVD":
        allactormoviesdata = pandas.read_csv("movie-actor.csv")

        alltagsdata = pandas.read_csv("mltags.csv")

        allactormoviesdata['max_actor_rank'] = allactormoviesdata.groupby(
            ['movieid'])['actor_movie_rank'].transform(max)
        allactormoviesdata['min_actor_rank'] = allactormoviesdata.groupby(
            ['movieid'])['actor_movie_rank'].transform(min)

        allactormoviesdata['actor_rank_weightage'] = allactormoviesdata.apply(
            compute_actor_weightage, axis=1)
        #
        # print(allactormoviesdata)

        min_timestamp = pandas.to_datetime(min(alltagsdata['timestamp']))
        max_timestamp = pandas.to_datetime(max(alltagsdata['timestamp']))

        alltagsdata['timestamp_weightage'] = alltagsdata.apply(
            CalculateTimestampWeights,
            axis=1,
            args=(min_timestamp, max_timestamp))

        mergeddata = allactormoviesdata[[
            'actorid', 'movieid', 'actor_rank_weightage'
        ]].merge(alltagsdata[['movieid', 'tagid', 'timestamp_weightage']],
                 on='movieid')

        #print(mergeddata[mergeddata['actorid'].isin([878356,1860883,316365,128645])])

        mergeddata['total_weightage'] = mergeddata.apply(
            aggregate_tf_weightages, axis=1)

        mergeddata['tag_weightage'] = mergeddata.groupby(
            ['actorid', 'tagid'])['total_weightage'].transform('sum')
        tfdata = mergeddata[['actorid', 'tagid', 'tag_weightage'
                             ]].drop_duplicates(subset=['tagid', 'actorid'])

        tfdata['total_weightage_actor'] = tfdata.groupby(
            ['actorid'])['tag_weightage'].transform('sum')

        tfdata['tf'] = tfdata.apply(ComputeTF, axis=1)

        taglist = tfdata['tagid'].tolist()
        alltagsdata = pandas.read_csv("mltags.csv")
        alltagsdata = alltagsdata[alltagsdata['tagid'].isin(taglist)]

        #print(alltagsdata)

        allactormoviesdata = pandas.read_csv("movie-actor.csv")
        requiredtagsdata = alltagsdata.merge(allactormoviesdata, on='movieid')

        requiredtagsdata.drop_duplicates(subset=['tagid', 'actorid'],
                                         inplace=True)
        requiredtagsdata['actor_count'] = requiredtagsdata.groupby(
            'tagid')['actorid'].transform('count')
        requiredtagsdata.drop_duplicates(subset=['tagid'], inplace=True)

        actordata = pandas.read_csv("imdb-actor-info.csv")
        total_actors = actordata.shape[0]

        requiredtagsdata['idf'] = requiredtagsdata.apply(
            ComputeIDF, axis=1, total_actors=total_actors)
        #
        # print(total_actors)
        # print(requiredtagsdata)

        tfidfdata = ProcessTFandIDFtoTFIDF(tfdata,
                                           requiredtagsdata[['tagid', 'idf']])

        # print(tfdata)

        #tfidfdata = tfidfdata[tfidfdata['actorid'].isin([878356,1860883,316365,128645])]

        #print(tfidfdata)

        actor_tag_matrix = tfidfdata.pivot_table(index='actorid',
                                                 columns='tagid',
                                                 values='tfidf',
                                                 fill_value=0)
        print "Actor Tag Matrix"
        print actor_tag_matrix

        tf = TFIDF("", 1, "_actor_")
        tf.calcMoviesVector()
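ComputeTF and ComputeIDF are not shown in this snippet. Judging only from the column names used above, they presumably follow the usual TF-IDF definitions, roughly as sketched below; the exact form and log base are assumptions:

import math

def ComputeTF(row):
    # Actor-tag weight relative to the actor's total tag weight (assumed form).
    return row['tag_weightage'] / row['total_weightage_actor']

def ComputeIDF(row, total_actors):
    # Inverse document frequency over actors (assumed form).
    return math.log(total_actors / row['actor_count'])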
Example #22
corpus = []
for text in new_df['content']:
    corpus.append(text)

titles = []
for title in new_df["title"]:
    titles.append(str(title))
#labels_df starts at df[5000] so we're good on the matching of labels to content
events = []
for event in labels_df["Event"][:1000]:
    events.append(str(event))


from TFIDF import TFIDF
# builds the TFIDF matrix (result not used below, since a pre-trained KMeans model is loaded)
TFIDF(corpus)

##############################################################################
###################KMEANS#####################################################
##############################################################################
from sklearn.externals import joblib
# Load a pre-existing KMeans model; to save a newly trained one instead:
# joblib.dump(km, '700_No_Ngram.pkl')
km = joblib.load("/Users/parkerglenn/Desktop/DataScience/Article_Clustering/KMeans_Cluster_Models/350_no_Ngram.pkl")
clusters = km.labels_.tolist()



#Only to create a new kmeans model
from sklearn.cluster import KMeans
Example #23
    faq = pd.read_csv('../data/interim/faq-text-separated.csv',
                      keep_default_na=False)
    test_questions = pd.read_csv('../data/test/test-questions.csv')
    features = ['Topic', 'Category', 'Department', 'question', 'answer']

    test_topics = pd.read_excel(
        '../../../Inquire Boulder request data- detailed open and closed - for research purposes.xlsx'
    )
    test_topics = test_topics[['Description', 'Topic']]
    test_topics = test_topics.rename(index=str,
                                     columns={
                                         "Description": "test_question",
                                         "Topic": "match_topic"
                                     })

    #     # Evaluate KDTree on questions
    #     kdtree = KDTREE(faq, features, 'KDTREE')
    #     kdtree.evaluate(test_questions, 'questions')

    #     # Evaluate Word2Vec on questions
    #     w2v = W2V(faq, features, 'W2V')
    #     w2v.evaluate(test_questions, 'questions')

    #     w2v.evaluate(test_topics, 'topics')

    # Evaluate TFIDF on questions and Topics
    tfidf = TFIDF(faq, features, 'TFIDF')
    tfidf.evaluate(test_questions, 'questions')

#     tfidf.evaluate(test_topics, 'topics')
Example #24
from ExtractAbstract import ExtractAbstract
from InformationContent import InformationContent

from TFIDF import TFIDF
from ClusterRelatedness import ClusterRelatedness
from DimensionRelatedness import DimensionRelatedness
from RelatednessGraph import RelatednessGraph

if __name__ == "__main__":
    '''
    PART 1 - 1
    Calculate IC and TFIDF
    '''
    IC = InformationContent("./source/ic.txt")
    #DEBUG: IC.printSortedList()
    TfIdf = TFIDF("./source/tfidf.txt")
    #DEBUG: TfIdf.printSortedList()
    '''
    PART 1 - 2
    Use IC and TFIDF to extract words from abstracts
    '''
    Extractor = ExtractAbstract("./source/corpus5.csv", IC, TfIdf, 0.35,
                                0.3)  # IC threshold / TFIDF threshold
    '''
    PART 2
    Calculate Relatedness
    '''
    # Finding Relatedness 1 - Find Vector Cluster
    # ClusterRelatedness = ClusterRelatedness("./source/vectors.txt", "./abstracts/", Extractor.fileNum)

    # Finding Relatedness 2 - Compare Word Pairs
    def __init__(self, model):
        self.model = model
        self.db = DBConnect()
        self.tfIdf = TFIDF("", "", "_actor_")
def get_tfidf():
    total_data = get_raw_pl()
    tf_idf = TFIDF(total_data)
    tfidf_scores, words = tf_idf.get_tfidf()

    return tfidf_scores
Example #27
corpus = []
for text in new_df['content']:
    corpus.append(text)

titles = []
for title in new_df["title"]:
    titles.append(str(title))
#labels_df starts at df[5000] so we're good on the matching of labels to content
events = []
for event in labels_df["Event"][:1000]:
    events.append(str(event))

import os
os.chdir("/Users/parkerglenn/Desktop/DataScience/Article_Clustering/Pre-Processing")
from TFIDF import TFIDF
tfidf_matrix = TFIDF(corpus)



####################################################################
##########################HAC#######################################
####################################################################
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram
hac = AgglomerativeClustering(n_clusters=500, affinity="euclidean")
dense_matrix = tfidf_matrix.todense()
hac.fit_predict(dense_matrix)

from sklearn.externals import joblib
#Saves the model you just made
joblib.dump(hac, '350_euc_HAC.pkl')
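After fit_predict, hac.labels_ holds the cluster assignment for each document; a short sketch of grouping the article titles by cluster (assuming titles is aligned with corpus, as both are built from new_df above):

from collections import defaultdict

hac_clusters = defaultdict(list)
for title, label in zip(titles, hac.labels_):
    hac_clusters[label].append(title)

for label in sorted(hac_clusters)[:5]:
    print(label, hac_clusters[label][:3])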