Example #1
def predict():
    preproses()
    td = TFIDF([xdata, ydata])
    clasification = []

    # Receives the input query from form
    if request.method == 'POST':
        namequery = request.form['namequery']
        spliter = namequery.split(',')

        for row in spliter:
            clasification.append(testFromTrained([td.transform(row)]))
        print(clasification)
        keras.clear_session()

        labels, values = np.unique(clasification, return_counts=True)
        lbls, vals = np.unique(clasification, return_counts=True)

    pie_labels = labels
    pie_values = values
    colors = ["#F7464A", "#46BFBD"]

    return render_template('hasil.html',
                           set=zip(values, labels, colors),
                           clasification=zip(spliter, clasification),
                           legenda=zip(lbls, vals))
Example #2
def wordcount(filename, ent_file, tfidf, text, id):
    resources = open(filename)
    resources.readline()  # header
    wordcount = TFIDF(get_entities(ent_file))
    for id, lines in groupby(csv.reader(resources), id):
        maintext = ' '.join(text(line).lower() for line in lines)
        wordcount.process(maintext)
    wordcount.done()

    out = open(tfidf, 'w')
    for word, _, _, tfidf in wordcount.highest(200):
        out.write('%s\t%f\n' % (word, tfidf))
Example #3
    def preprocess(self, filepath):
        dataset = pd.read_csv(filepath, delimiter=',')

        self.xData = []
        self.yData = []

        for k in dataset['Kalimat']:
            self.xData.append(k)

        for k in dataset['Formalitas']:
            self.yData.append(k)

        self.tfidf_data = TFIDF([self.xData, self.yData])
Example #4
    def test_tfidf(self):
        """
        Test the TF-IDF scheme.
        """

        idf = {'a': 2, 'b': 1, 'c': 1}
        tokens = ['a', 'b', 'b', 'c', 'd']
        tfidf = TFIDF(idf, 3)

        document = tfidf.create(tokens)
        self.assertEqual(0, document.dimensions['a'])
        self.assertEqual(0.35218, round(document.dimensions['b'], 5))
        self.assertEqual(0.17609, round(document.dimensions['c'], 5))
        self.assertEqual(0.47712, round(document.dimensions['d'], 5))
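The expected dimensions in this test are consistent with a raw term count multiplied by log10(documents / (df + 1)), with df taken from the idf table passed to the constructor. A minimal sketch of that arithmetic (an assumption about the scheme under test, not code from the library itself):

import math

idf = {'a': 2, 'b': 1, 'c': 1}      # term -> document frequency
documents = 3                        # total documents, as in TFIDF(idf, 3)
tokens = ['a', 'b', 'b', 'c', 'd']

for term in sorted(set(tokens)):
    tf = tokens.count(term)                                       # raw term frequency
    weight = tf * math.log10(documents / (idf.get(term, 0) + 1))  # additive-smoothed IDF
    print(term, round(weight, 5))    # a 0.0, b 0.35218, c 0.17609, d 0.47712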
Example #5
    def parseQuery(self, query, invIndex):
        #Both handlers return the respective TF_IDFs
        #docTF_IDF can be run once after crawl
        tfidf = TFIDF()
        # print invIndex
        docTF_IDF = tfidf.docHandler(invIndex, 0)
        # print docTF_IDF
        queryTF_IDF = self.queryHandler(query, invIndex)
        if queryTF_IDF == -1:
            print "No words from your search were found in any documents...Please try new search terms!"
            return -1

        cosSimByDoc = self.cosSimilarityHandler(docTF_IDF, queryTF_IDF)
        # print "Cosine Similarity by document:", cosSimByDoc
        return cosSimByDoc
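cosSimilarityHandler itself is not shown here; as an illustration of the underlying computation (not the project's actual handler), cosine similarity between two TF-IDF vectors stored as term -> weight dicts can be computed like this:

import math

def cosine_similarity(query_vec, doc_vec):
    # dot product over the terms the two vectors share
    dot = sum(weight * doc_vec.get(term, 0.0) for term, weight in query_vec.items())
    query_norm = math.sqrt(sum(w * w for w in query_vec.values()))
    doc_norm = math.sqrt(sum(w * w for w in doc_vec.values()))
    if query_norm == 0.0 or doc_norm == 0.0:
        return 0.0
    return dot / (query_norm * doc_norm)

# e.g. cosine_similarity({'tfidf': 0.5, 'search': 0.2}, {'search': 0.4, 'index': 0.1})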
Example #6
 def __train_models(self):
     # Now load all sentences from specific domain, and train TFIDF model and NGramPerplexity model.
     self.ngp = NGramPerplexity()
     self.tfidf = TFIDF()
     print("Training models from specific corpora")
     for file in os.listdir(self.input_dir):
         print("Training models from specific corpora: " + file)
         with open(self.input_dir + "/" + file, encoding="utf-8") as input:
             for line in input:
                 words = WordExtractor.get_words(line)
                 if len(words) == 0:
                     continue
                 self.sentences.append(words)
                 self.ngp.train_from_text(words)
                 self.tfidf.train_from_text(words)
Example #7
    def test_export(self):
        """
        Test exporting and importing the IDF table.
        """

        idf = {'a': 2, 'b': 1, 'c': 1}
        tfidf = TFIDF(idf, 3)

        e = tfidf.to_array()
        self.assertEqual(tfidf.global_scheme.documents,
                         TFIDF.from_array(e).global_scheme.documents)
        self.assertEqual(tfidf.global_scheme.idf,
                         TFIDF.from_array(e).global_scheme.idf)
        self.assertEqual(tfidf.local_scheme.__dict__,
                         TFIDF.from_array(e).local_scheme.__dict__)
        self.assertEqual(tfidf.global_scheme.__dict__,
                         TFIDF.from_array(e).global_scheme.__dict__)
Example #8
    def preproses(self, filepath):
        f = open(filepath)

        # split new line
        sents = f.read().split('\n')

        # shuffle all sentences order
        shuffle(sents)

        # on each sentence
        # - split by semicolon
        # - append to variable
        for sent in sents:
            temp = sent.split(';')
            if len(temp) == 2:
                self.xdata.append(temp[0])
                self.ydata.append([int(temp[1])])

        # prepare tfidf feature
        self.tfidf_data = TFIDF([self.xdata, self.ydata])
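For reference, the loop above assumes one record per line, with the sentence and an integer label separated by a semicolon; a hypothetical two-line input file (labels invented for illustration) would look like:

ahok itu pemimpin yang beres memimpin;1
ahok itu pemimpin yang ga beres memimpin;0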
Example #9
def count(district,
          type='essays',
          extract_text=lambda line: ' '.join(line[3:10]),
          id=lambda line: line[0]):
    (_projectid, _teacher_acctid, _schoolid, school_ncesid, school_latitude,
     school_longitude, school_city, school_state, school_zip, school_metro,
     school_district, school_county, school_charter, school_magnet,
     school_year_round, school_nlns, school_kipp, school_charter_ready_promise,
     teacher_prefix, teacher_teach_for_america, teacher_ny_teaching_fellow,
     primary_focus_subject, primary_focus_area, secondary_focus_subject,
     secondary_focus_area, resource_usage, resource_type, poverty_level,
     grade_level, vendor_shipping_charges, sales_tax,
     payment_processing_charges, fulfillment_labor_materials,
     total_price_excluding_optional_support,
     total_price_including_optional_support, students_reached,
     used_by_future_students, total_donations, num_donors,
     eligible_double_your_impact_match, eligible_almost_home_match,
     funding_status, date_posted, date_completed, date_thank_you_packet_mailed,
     date_expiration) = range(46)
    proj_ids = []
    projects = open('../data/projects.%scsv' % district)
    projects.readline().strip()  # header
    for proj in csv.reader(projects):
        if proj[date_posted].startswith('2011'):
            proj_ids.append(proj[0])
    proj_ids = frozenset(proj_ids)
    projects.close()

    wordcount = TFIDF(get_entities(ent_file))
    essays = open('../data/%s.%scsv' % (type, district))
    essays.readline()  # header
    for proid, lines in groupby(csv.reader(essays), id):
        if proid in proj_ids:
            text = ' '.join(extract_text(line) for line in lines).lower()
            wordcount.process(text)
    wordcount.done()
    essays.close()

    out = open('../data/wc_%s%scsv' % (type, district), 'w')
    for word, tf, df, tfidf in wordcount.highest(0):
        out.write('%s\t%f\t%f\t%f\n' % (word, tf, df, tfidf))
Example #10
def getRecommendation(new_df, record):
    temp_df = new_df[['id','name', 'album', 'artist', 'release_date']]
    temp_df = pd.concat([temp_df, record], ignore_index = True)
    
    col = ['name', 'album', 'artist', 'release_date']
    data = pd.DataFrame(columns=col)
    id = []
    for i in col:
        yield "<br/>"
        tf = TFIDF(temp_df, i)
        cosine_sim = linear_kernel(tf, tf) 
        data[i] = cosine_sim[-1]
        d1 = data.sort_values(by=[i], ascending=False)
        id.append(list(d1.head(7).index))
    
    tid = []
    for i in range(4):
        track_id = []
        for j in id[i]:
            track_id.append(temp_df.iloc[j, 0]) 
        tid.append(track_id)
    return tid 
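TFIDF(temp_df, i) above appears to be a project-specific helper that returns a TF-IDF matrix for one text column; done directly with scikit-learn, the per-column similarity step looks roughly like this sketch (not the helper's actual implementation):

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

temp_df = pd.DataFrame({'artist': ['artist a', 'artist b', 'artist a']})

tf = TfidfVectorizer().fit_transform(temp_df['artist'].astype(str))
cosine_sim = linear_kernel(tf, tf)  # TF-IDF rows are L2-normalised, so this equals cosine similarity
print(cosine_sim[-1])               # similarity of the last row (the query record) to every row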
Example #11
def upload_file():
    if request.method == 'POST':
        if 'file' not in request.files:
            flash('No file part')
            # return redirect(request.url)
        file = request.files['file']

        if file.filename == '':
            flash('No selected file')
            # return redirect(request.url)
        if file and allowed_file(file.filename):
            filename = secure_filename(file.filename)
            file.save(os.path.join(app.config['UPLOAD_FOLDER'], filename))
            # return redirect(url_for('upload_file', filename=filename))
        print(filename)
        fold = "data/" + filename
        print(fold)
        with open(fold, 'r') as csv_par:
            preproses()
            td = TFIDF([xdata, ydata])
            clasification = []
            csv_reader = csv_par.read().split('\n')

    for row in csv_reader:
        clasification.append(testFromTrained([td.transform(row)]))

    keras.clear_session()
    labels, values = np.unique(clasification, return_counts=True)
    lbls, vals = np.unique(clasification, return_counts=True)

    pie_labels = labels
    pie_values = values
    colors = ["#F7464A", "#46BFBD"]

    return render_template('hasil.html',
                           set=zip(values, labels, colors),
                           clasification=zip(csv_reader, clasification),
                           legenda=zip(lbls, vals))
Example #12
def parsing():

    with open('data/test.csv', 'r') as csv_par:
        preproses()
        td = TFIDF([xdata, ydata])
        rowdata = []
        clasification = []
        csv_reader = csv_par.read().split('\n')
    for row in csv_reader:
        rowdata.append(row)
        clasification.append(testFromTrained([td.transform(row)]))

    keras.clear_session()
    labels, values = np.unique(clasification, return_counts=True)
    lbls, vals = np.unique(clasification, return_counts=True)

    pie_labels = labels
    pie_values = values
    colors = ["#F7464A", "#46BFBD"]

    return render_template('hasil.html',
                           set=zip(values, labels, colors),
                           clasification=zip(csv_reader, clasification),
                           legenda=zip(lbls, vals))
Example #13
def start(tfidf_threshold):

    #initialize TFIDF
    phrase_file = open("text_segmented_by_phrase.txt", "r")
    for line in phrase_file:
        index, text = line.split("##")
        token_list = text.lower().strip().split("!!")
        id_phrases[index] = token_list
    phrase_file.close()
    tfidf = TFIDF(id_phrases.values())
    print("TFIDF initialized")

    input_file = open("publications.txt")
    #input_file = open("pub_min.txt")
    while True:
        '''
            Parse paper title.
            Test for EOF.
        '''
        line = input_file.readline().strip()
        if len(line) == 0:
            break
        assert line[:2] == "#*"
        title = line[2:]
        '''
            Parse author.
        '''
        line = input_file.readline().strip()
        assert line[:2] == "#@"
        authors = line[2:].split(',')
        '''
            Parse Year
        '''
        input_file.readline()
        '''
            Parse Venue
        '''
        line = input_file.readline().strip()
        assert line[:2] == "#c"
        venue = line[2:]
        '''
            Parse paper id.
            Do not cast to integer. Simply unnecessary.
        '''
        line = input_file.readline().strip()
        assert line[:6] == "#index"
        id = line[6:]
        id_title[id] = title

        for a in authors:
            dictionary_add_set(author_papers, a, id)
            dictionary_add_set(author_venues, a, venue)
        dictionary_add_set(venue_papers, venue, id)

        paper_venue[id] = venue
        paper_authors[id] = authors
        '''
            Parse citations.
        '''
        line = input_file.readline().strip()
        while line[:2] == "#%":
            '''
                Invalid/empty citation.
            '''
            if len(line) <= 2:
                break
            dictionary_add_set(paper_papers, id, line[2:])
            line = input_file.readline().strip()
        '''
            Read the empty string line so the readline output is not confused with
            EOF.
            Sets the reading pointer to the next paper's title line.
        '''
        line = input_file.readline()
        if line[:2] == "#!":
            input_file.readline()
    '''
        Get terms for each paper.
    '''
    phrase_file = open("text_segmented_by_phrase.txt", "r")
    for paper_id, tok_list in id_phrases.items():
        '''
            Assuming (id, list_of_tokens). If I'm wrong, the code will HCF.
        '''
        toks = [x for x in tok_list if len(x) > 2 and \
                                    tfidf.tf_idf(x) > tfidf_threshold]
        toks = sorted(toks, key=lambda x: tfidf.tf_idf(x), reverse=False)
        paper_terms[paper_id] = toks[:min(3, len(toks))]
        for term in paper_terms[paper_id]:
            if term not in term_papers:
                term_papers[term] = []
            term_papers[term].append(paper_id)

    return paper_authors, \
           paper_papers, \
           paper_venue, \
           author_papers, \
           venue_papers, \
           author_venues
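The parser above expects a DBLP-style citation dump; a hypothetical record matching the prefixes it asserts on would look like the following (the '#t' year prefix is an assumption, since that line is read and discarded without a check):

#*An Example Paper Title
#@Author One,Author Two
#t2009
#cAn Example Venue
#index12345
#%67890
#!Optional abstract text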
Example #14
def reward2(s1, s2):
    indices = corpus[:]
    tfi = TFIDF()
    tfidf = tfi.get_tfidf(corpus)
    score = tfi.relevancy(tfidf, indices, s1, s2)
    return score + 1
Example #15
def get_pred_api_set(desc):
    tfidf = TFIDF(desc).gen_vector()
    cluster = Match(tfidf).match()
    topN = TopN(cluster).get()
    return set(topN)
Example #16
			static.upload_folder(Sm_Cover_Dir, overwrite=True)
			static.upload_folder(Bg_Cover_Dir, overwrite=True)
			logger.info("update static server success !")

			with NewsDB() as db:
				db.update_table_newsContent(method="rebuild", fromCache=False)
				db.update_table_newsDetail(method="update")'''

			with NewsDB() as db: # do not update static, only update the DB
				db.update_table_newsInfo(method="rebuild", fromCache=False)
				db.update_table_newsContent(method="rebuild", fromCache=False)
				db.update_table_newsDetail(method="update")

			WhooshIdx().create_idx()
			logger.info("update TFIDF ...")
			tfidf = TFIDF().init_for_update()
			tfidf.update()
			logger.info("update TFIDF success !")

		else: # for routine daily updates
			with NewsDB() as db:
				db.update_table_newsInfo(fromCache=False)
				newsIDs = db.get_newsIDs()

			# update the static server right after refreshing the news IDs, so users requesting images during the update window do not cache a 404 page
			logger.info("update static server ...")
			static = StaticManager(newsIDs)
			static.download_covers()
			static.to_jpeg()
			static.cv_compress_sm() # just output the result directly
			static.cv_compress_bg()
Example #17
from tfidf import TFIDF
from match import Match
from topN import TopN
import sys

desc = sys.argv[1]

# online phase step 1
tfidf = TFIDF(desc).gen_vector()

# online phase step 2
cluster = Match(tfidf).match()

# online phase step 3
topN = TopN(cluster).get()
for i in topN:
	print(i)
Example #18
import pickle

from TrainingWithTFIDF import TFIDFTrainer
tfidfTrainer = TFIDFTrainer()
from FeatureExtractionWithTFIDF import TFIDFPreparer
from tfidf import TFIDF
tfidfInstance = TFIDF()
import nltk
tfidfPreparer = TFIDFPreparer()


class IntentDetector:
    def prepareForNLP(self, text):
        sentences = nltk.sent_tokenize(text)
        sentences = [nltk.word_tokenize(sent) for sent in sentences]
        sentences = [nltk.pos_tag(sent) for sent in sentences]
        return sentences

    def getFilterChunk(self, sentence):
        chunkToExtract = """
            pattern:
           
            {<NNP|NNS|NN><WDT>?<VBP|VBZ>?<JJR>?<IN><CD><CC>?<CD>?}
               """

        parser = nltk.RegexpParser(chunkToExtract)
        result = parser.parse(sentence)

        chunks = []
        for subtree in result.subtrees():
            if subtree.label() == 'pattern':
                # the snippet is truncated here; collecting the matched subtrees is the apparent intent
                chunks.append(subtree)
        return chunks
Example #19
def parseQuery(query, invIndex):
    #Both handlers return the respective TF_IDFs
    #docTF_IDF can be run once after crawl
    tempTFIDF = TFIDF()
    queryObj = Query(query)
Example #20
def start():

    #initialize TFIDF
    tfidf = TFIDF("tfidf_data/name_and_abstracts.txt")
    print("TFIDF initialized")

    #input_file = open("publications.txt")
    input_file = open("pub_min.txt")
    while True:
        '''
            Parse paper title.
            Test for EOF.
        '''
        line = input_file.readline().strip()
        if len(line) == 0:
            break
        assert line[:2] == "#*"
        title = line[2:]
        toks = word_tokenize(title)
        toks = sorted(toks, key=lambda x: tfidf.tf_idf(x), reverse=True)
        print "sorted toks:" + str(toks)
        '''
            Parse author.
        '''
        line = input_file.readline().strip()
        assert line[:2] == "#@"
        authors = line[2:].split(',')
        '''
            Parse Year
        '''
        input_file.readline()
        '''
            Parse Venue
        '''
        line = input_file.readline().strip()
        assert line[:2] == "#c"
        venue = line[2:]
        '''
            Parse paper id.
            Do not cast to integer. Simply unnecessary.
        '''
        line = input_file.readline().strip()
        assert line[:6] == "#index"
        id = line[6:]

        for a in authors:
            dictionary_add_set(author_papers, a, id)
            dictionary_add_set(author_venues, a, venue)
        dictionary_add_set(venue_papers, venue, id)

        paper_venue[id] = venue
        paper_authors[id] = authors
        '''
            Parse citations.
        '''
        line = input_file.readline().strip()
        while line[:2] == "#%":
            '''
                Invalid/empty citation.
            '''
            if len(line) <= 2:
                break
            dictionary_add_set(paper_papers, id, line[2:])
            line = input_file.readline().strip()
        '''
            Read the empty string line so the readline output is not confused with
            EOF.
            Sets the reading pointer to the next paper's title line.
        '''
        line = input_file.readline()
        if line[:2] == "#!":
            input_file.readline()

    return paper_authors, \
           paper_papers, \
           paper_venue, \
           author_papers, \
           venue_papers, \
           author_venues
Example #21
if __name__ == '__main__':
    # Get command-line args
    args_ = get_setup_args()

    # Download resources
    download(args_)

    # Import spacy language model
    nlp = spacy.blank("en")

    # Keep all the docs for TF-IDF initialization
    tfidf_docs = []

    # Preprocess dataset
    args_.train_file = url_to_data_path(args_.train_url)
    args_.dev_file = url_to_data_path(args_.dev_url)
    if args_.include_test_examples:
        args_.test_file = url_to_data_path(args_.test_url)
    glove_dir = url_to_data_path(args_.glove_url.replace('.zip', ''))
    glove_ext = '.txt' if glove_dir.endswith('d') else '.{}d.txt'.format(
        args_.glove_dim)
    args_.glove_file = os.path.join(glove_dir,
                                    os.path.basename(glove_dir) + glove_ext)
    pre_process(args_)

    from tfidf import TFIDF
    print(len(tfidf_docs))
    tfidf_scorer = TFIDF(tfidf_docs)
    tfidf_scorer.prepare_data()
    tfidf_scorer.save_to_pickle()
Example #22
 def __init__(self):
     self.documents = {}
     self.tfidf = TFIDF()
Example #23
def main(args):
    # Load TF-IDF from pickle
    scorer = TFIDF([])
    scorer.get_from_pickle()

    # Set up logging
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=False)
    log = util.get_logger(args.save_dir, args.name)
    log.info('Args: {}'.format(dumps(vars(args), indent=4, sort_keys=True)))
    device, gpu_ids = util.get_available_devices()
    args.batch_size *= max(1, len(gpu_ids))

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)

    # Get data loader
    log.info('Building dataset...')
    record_file = vars(args)['{}_record_file'.format(args.split)]
    dataset = SQuAD(record_file, args.use_squad_v2)
    data_loader = data.DataLoader(dataset,
                                  batch_size=args.batch_size,
                                  shuffle=False,
                                  num_workers=args.num_workers,
                                  collate_fn=collate_fn)

    # Get model
    log.info('Building model...')
    model = BiDAF(word_vectors=word_vectors,
                  char_vocab_size=1376,
                  hidden_size=args.hidden_size)
    model = nn.DataParallel(model, gpu_ids)
    log.info('Loading checkpoint from {}...'.format(args.load_path))
    model = util.load_model(model, args.load_path, gpu_ids, return_step=False)
    model = model.to(device)
    model.eval()

    # Evaluate
    log.info('Evaluating on {} split...'.format(args.split))
    nll_meter = util.AverageMeter()
    pred_dict = {}  # Predictions for TensorBoard
    sub_dict = {}   # Predictions for submission
    eval_file = vars(args)['{}_eval_file'.format(args.split)]
    with open(eval_file, 'r') as fh:
        gold_dict = json_load(fh)
    with torch.no_grad(), \
            tqdm(total=len(dataset)) as progress_bar:
        for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader:
            # Setup for forward
            cw_idxs = cw_idxs.to(device)
            qw_idxs = qw_idxs.to(device)
            batch_size = cw_idxs.size(0)

            # Forward
            log_p1, log_p2 = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs)
            y1, y2 = y1.to(device), y2.to(device)
            loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
            nll_meter.update(loss.item(), batch_size)

            # Get F1 and EM scores
            p1, p2 = log_p1.exp(), log_p2.exp()
            starts, ends = util.discretize(p1, p2, args.max_ans_len, args.use_squad_v2)

            # Log info
            progress_bar.update(batch_size)
            if args.split != 'test':
                # No labels for the test set, so NLL would be invalid
                progress_bar.set_postfix(NLL=nll_meter.avg)

            idx2pred, uuid2pred = util.convert_tokens(gold_dict,
                                                      ids.tolist(),
                                                      starts.tolist(),
                                                      ends.tolist(),
                                                      args.use_squad_v2)
            pred_dict.update(idx2pred)
            sub_dict.update(uuid2pred)

    if args.use_tfidf:
        # Apply TF-IDF filtering to pred_dict
        tf_idf_threshold = 2
        tf_idf_common_threshold = 1
        for key, value in pred_dict.items():
            if value != "":
                tf_idf_score = scorer.normalized_additive_idf_ignore_common_words(
                    value, threshold_frequency=tf_idf_common_threshold)
                if tf_idf_score < tf_idf_threshold:
                    pred_dict[key] = ''
                    pass
                    # print ("pred_dict: {}, pruned".format(tf_idf_score))
                else:
                    pass
                    # print ("pred_dict: {}, kept".format(tf_idf_score))

    # Log results (except for test set, since it does not come with labels)
    if args.split != 'test':
        results = util.eval_dicts(gold_dict, pred_dict, args.use_squad_v2)
        results_list = [('NLL', nll_meter.avg),
                        ('F1', results['F1']),
                        ('EM', results['EM'])]
        if args.use_squad_v2:
            results_list.append(('AvNA', results['AvNA']))
        results = OrderedDict(results_list)

        # Log to console
        results_str = ', '.join('{}: {:05.2f}'.format(k, v)
                                for k, v in results.items())
        log.info('{} {}'.format(args.split.title(), results_str))

        # Log to TensorBoard
        tbx = SummaryWriter(args.save_dir)
        util.visualize(tbx,
                       pred_dict=pred_dict,
                       eval_path=eval_file,
                       step=0,
                       split=args.split,
                       num_visuals=args.num_visuals)

    # Write submission file
    sub_path = join(args.save_dir, args.split + '_' + args.sub_file)
    log.info('Writing submission file to {}...'.format(sub_path))
    with open(sub_path, 'w') as csv_fh:
        csv_writer = csv.writer(csv_fh, delimiter=',')
        csv_writer.writerow(['Id', 'Predicted'])
        for uuid in sorted(sub_dict):
            csv_writer.writerow([uuid, sub_dict[uuid]])
Example #24
    loaded_model_json = json_file.read()
    json_file.close()
    model = model_from_json(loaded_model_json)

    # load weights into new self.model
    model.load_weights("model/model.h5")
    print("Loaded model from disk")

    sgd = SGD(lr=0.01)

    model.compile(loss='binary_crossentropy', optimizer=sgd)
    return getBinaryResult(model.predict_proba(np.array(x)))


preproses()
td = TFIDF([xdata, ydata])

# TRAINING
# train(td.getOnlyX(), ydata)

# RETRAINING
# retrain_model(td.getOnlyX(), ydata)

# TESTING
test = "ahok itu pemimpin yang beres memimpin"
print(test)
print(testFromTrained([td.transform(test)]))

test = "ahok itu pemimpin yang ga beres memimpin"
print(test)
print(testFromTrained([td.transform(test)]))
Example #25
 def calcTFIDF(self):
     t = TFIDF()
     self.tfidf = t.docHandler(self.inverted_index, self.unique_id)
Example #26
 def __init__(self):
     self.preparation = Preparation()
     self.tfidf = TFIDF()
Example #27
    # read scrap_workbook
    scrap_workbook = read_scrap(args.scrap_file_name)
    #
    ## ES6
    ES6_sheet = scrap_workbook["蔚来ES6"]
    review_container = ReviewContainer(ES6_sheet)
    review_list = review_container.get_review_list()
    doc_word_count_info_list = build_doc_word_count_info_list(review_list)

    ## build model data structure
    term_container = TermContainer(doc_word_count_info_list)
    inverted_file = InvertedFile(term_container, doc_word_count_info_list)

    # build query
    query_list = get_query_list(args.query_expand_workbook_path)
    query_expand_impl = QueryExpandImpl(args.query_expand_workbook_path)
    set_topk_for_query_list(query_list, args.topk)
    apply_query_expand_to_query_list(query_list, query_expand_impl)

    # search
    tfidf_engine = TFIDF(review_container)
    apply_query_search_to_query_list(query_list, inverted_file, tfidf_engine,
                                     review_container)

    # output_workbook
    workbook = Workbook()
    update_workbook_for_query_list(query_list, review_container, workbook)
    workbook.remove(workbook['Sheet'])
    workbook.save(args.output_path)
Example #28
 def __init__(self, queryString=""):
     print "Constructing Query Object!"
     self.invIndex = InvertedIndex()
     self.tfidf = TFIDF()
     self.query = queryString
Example #29
    def createTrainingSet(self):

        # initialize one class SVM models for each intent
        from sklearn import svm
        windowModel = svm.OneClassSVM(nu=0.01, kernel="linear")
        filterModel = svm.OneClassSVM(nu=0.01, kernel="linear")
        aggregateModel = svm.OneClassSVM(nu=0.01, kernel="linear")
        groupModel = svm.OneClassSVM(nu=0.01, kernel="linear")

        from tfidf import TFIDF
        tfidfInstance = TFIDF()

        documents = []
        fdoc = []
        adoc = []
        wdoc = []
        gdoc = []

        import json
        with open('intents.json') as json_data:
            intentsData = json.load(json_data)
        for intent in intentsData['intents']:
            for pattern in intent['pattern']:
                documents.append(pattern)
                if intent['tag'] == "filter":
                    fdoc.append(pattern)
                if intent['tag'] == "window":
                    wdoc.append(pattern)
                if intent['tag'] == "aggre":
                    adoc.append(pattern)
                if intent['tag'] == "group":
                    gdoc.append(pattern)

        texts = []
        # words relevant to the stream. These words do not help in intent detection and must be removed
        from FeatureExtractionWithTFIDF import TFIDFPreparer
        tfidfPreparer = TFIDFPreparer()
        for doc in documents:
            text = tfidfPreparer.prepareTextForTFIDF(doc)
            texts.append(text)

        self.countVectorizer, self.idf = tfidfInstance.getIDF(documents)

        self.tfidf_filter = tfidfInstance.getTFIDF(fdoc, self.countVectorizer,
                                                   self.idf)
        self.tfidf_aggre = tfidfInstance.getTFIDF(adoc, self.countVectorizer,
                                                  self.idf)
        self.tfidf_window = tfidfInstance.getTFIDF(wdoc, self.countVectorizer,
                                                   self.idf)
        self.tfidf_group = tfidfInstance.getTFIDF(gdoc, self.countVectorizer,
                                                  self.idf)

        x_filter = []
        for i in range(len(fdoc)):
            total = tfidfPreparer.getSumOfCosineSimilarity(
                self.tfidf_filter[i], self.tfidf_filter)
            x_filter.append([total])
        x_aggre = []
        for i in range(len(adoc)):
            total = tfidfPreparer.getSumOfCosineSimilarity(
                self.tfidf_aggre[i], self.tfidf_aggre)
            x_aggre.append([total])
        x_window = []
        for i in range(len(wdoc)):
            total = tfidfPreparer.getSumOfCosineSimilarity(
                self.tfidf_window[i], self.tfidf_window)
            x_window.append([total])
        x_group = []
        for i in range(len(gdoc)):
            total = tfidfPreparer.getSumOfCosineSimilarity(
                self.tfidf_group[i], self.tfidf_group)
            x_group.append([total])

        filterModel.fit(x_filter)
        windowModel.fit(x_window)
        aggregateModel.fit(x_aggre)
        groupModel.fit(x_group)

        import pickle
        filename = 'finalized_windowModel.sav'
        pickle.dump(windowModel, open(filename, 'wb'))
        filename = 'finalized_filterModel.sav'
        pickle.dump(filterModel, open(filename, 'wb'))
        filename = 'finalized_aggregateModel.sav'
        pickle.dump(aggregateModel, open(filename, 'wb'))
        filename = 'finalized_groupModel.sav'
        pickle.dump(groupModel, open(filename, 'wb'))
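Only training and pickling are shown above; a hedged sketch of prediction-time use of one of these one-class models, with the cosine-similarity feature value stubbed out (the real value would come from the same getSumOfCosineSimilarity helper used during training):

import pickle

# load one of the intent models pickled above
filterModel = pickle.load(open('finalized_filterModel.sav', 'rb'))

# placeholder feature value; in practice this is the summed cosine similarity of the
# query's TF-IDF vector against the "filter" training vectors
total = 0.85

# OneClassSVM.predict returns +1 for inliers (query matches the intent) and -1 for outliers
print(filterModel.predict([[total]]))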
Example #30
print(wikipedia[3][0])
#return wikipedia

import copy

#def get_corpus():
corpus = [news.text]
titles = [soup.title]
print(len(corpus))

for article in wikipedia:
    # if article[0] in topics:
    corpus.append(article[1])
    titles.append(article[0])
print(len(corpus))
#return corpus, titles

from tfidf import TFIDF

#def get_sim_docs():
tfi = TFIDF()
tfidf = tfi.get_tfidf(corpus)

sim_docs = []
for index, score in tfi.similar_docs(tfidf, 0, 5):
    sim_docs.append((index, score))
    print(score, titles[index])

print "Most relevant document is " + titles[sim_docs[0][0]]
#return sim_docs