Example #1
def __init__(self, modelpackagepath):
        packageFiles = {
            "m1IndexFile": "m1.index",
            "m2IndexFile": "m2.index",
            "detectorsFile": "Detectors.json",
            "sampleUtterancesFile": "SampleUtterances.json",
            "mappingsFile": "Mappings.json",
            "modelInfo": "ModelInfo.json"
        }
        for key in packageFiles.keys():
            packageFiles[key] = absPath(os.path.join(modelpackagepath, packageFiles[key]))
        self.packageFiles = packageFiles
        self.optionalFiles = ["mappingsFile"]

        if not self.verifyModelFiles():
            raise ModelFileVerificationFailed(fileMissingMessage)

        self.models = {"m1Index": None, "m2Index": None, "detectors": None, "sampleUtterances": None, "mappings": None, "modelInfo": None}
        try:
            with open(self.packageFiles["modelInfo"], "r") as fp:
                self.models["modelInfo"] = ModelInfo(json.loads(fp.read()))
        except:
            self.models["modelInfo"] = ModelInfo({})
        try:
            self.models["m1Index"] = WmdSimilarity.load(self.packageFiles["m1IndexFile"])
        except:
            raise ModelFileLoadFailed("Failed to load index from file " + self.packageFiles["m1IndexFile"])
        try:
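            # note: the m2 index below is loaded and then immediately discarded; only the load itself is exercised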
            self.models["m2Index"] = WmdSimilarity.load(self.packageFiles["m2IndexFile"])
            self.models["m2Index"] = None
            del self.models["m2Index"]
        except:
            raise ModelFileLoadFailed("Failed to load index from file " + self.packageFiles["m2IndexFile"])
        try:
            with open(self.packageFiles["detectorsFile"], "r") as f:
                self.models["detectors"] = json.loads(f.read())
        except:
            raise ModelFileLoadFailed("Failed to parse json from file " + self.packageFiles["detectorsFile"])
        if self.models["modelInfo"].detectorContentSplitted:
            try:
                with open(self.packageFiles["mappingsFile"], "r") as f:
                    self.models["mappings"] = json.loads(f.read())
            except:
                raise ModelFileLoadFailed("Failed to parse json from file " + self.packageFiles["mappingsFile"])
        try:
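            # note: sample utterances below are parsed and then immediately discarded; only the parse itself is exercised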
            with open(self.packageFiles["sampleUtterancesFile"], "r") as f:
                self.models["sampleUtterances"] = json.loads(f.read())
                self.models["sampleUtterances"] = None
                del self.models["sampleUtterances"]
        except:
            raise ModelFileLoadFailed("Failed to parse json from file " + self.packageFiles["sampleUtterancesFile"])
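The constructor above only reloads prebuilt indexes via WmdSimilarity.load(). As a hedged sketch of how such an index file could be produced in the first place (the toy corpus, the tiny Word2Vec model, and the m1.index path are illustrative assumptions, using the gensim 3.x API seen throughout these examples):

from gensim.models import Word2Vec
from gensim.similarities import WmdSimilarity

# toy tokenized corpus and a small word2vec model (illustrative only)
corpus = [["slow", "response", "time"], ["application", "keeps", "crashing"], ["high", "cpu", "usage"]]
w2v = Word2Vec(corpus, size=50, min_count=1)

index = WmdSimilarity(corpus, w2v, num_best=5)
index.save("m1.index")  # later recoverable with WmdSimilarity.load("m1.index")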
Example #2
    def main(self):

        print("Recommendation using Embeddings-Phrases")

        # Loading preprocessed data
        vagas_ti = pd.read_csv(self.dataPrepFile)

        # Loading cvs data
        cvs = pd.read_csv(self.dataCvsFile)

        # Loading bigram and trigrams
        bigram = pickle.load(
            open(self.out + "wordEmbeddings/vagas_cv.bigram", "rb"))
        trigram = pickle.load(
            open(self.out + "wordEmbeddings/vagas_cv.trigram", "rb"))

        # Preprocessing cvs
        cvs = preprocessingCvsPhrases(cvs, bigram, trigram, self.out)

        # Preprocessing job offers
        vagas_skills, vagas_ids = preprocessingJobsPhrases(
            vagas_ti, bigram, trigram, self.out)

        # Loading model
        model_skill_skg = gsm.Word2Vec.load(
            self.out + "wordEmbeddings/ti_skill_phrases_skg.model")
        model_skill_cbow = gsm.Word2Vec.load(
            self.out + "wordEmbeddings/ti_skill_phrases_cbow.model")

        # Using similarity framework for Word Mover's Distance (WMD)
        num_best = 10
        start = time()
        #Normalizing word2vec vectors
        model_skill_skg.init_sims(replace=True)
        instance_skg = WmdSimilarity(vagas_skills,
                                     model_skill_skg,
                                     num_best=num_best)
        print("Time: %.4f" % (time() - start))

        start = time()
        model_skill_cbow.init_sims(replace=True)
        instance_cbow = WmdSimilarity(vagas_skills,
                                      model_skill_cbow,
                                      num_best=num_best)
        print("Time: %.4f" % (time() - start))

        self.recommendation(cvs, vagas_ti, vagas_ids, num_best, instance_skg,
                            "skg")
        self.recommendation(cvs, vagas_ti, vagas_ids, num_best, instance_cbow,
                            "cbow")

        print("Recommendation using Embeddings-Phrases done!")
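The WmdSimilarity instances built above are queried with a tokenized document and yield (corpus index, similarity) pairs. A hedged sketch of such a query, assuming cvs exposes a per-CV token list in a "tokens" column (the column name is an assumption) and that vagas_ids is aligned with vagas_skills:

# hypothetical query against the skip-gram instance
cv_tokens = cvs.iloc[0]["tokens"]
for job_idx, score in instance_skg[cv_tokens]:
    print(vagas_ids[job_idx], round(score, 4))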
Example #3
def wmd_similarity(lang, docs1, docs2):
    '''
    Input:
        lang: text language, 'cn' for Chinese / 'en' for English
        docs1: document strings list 1
        docs2: document strings list 2
    Output:
        list of WMD similarities, one per (docs1[i], docs2[i]) pair
    '''

    # check that the document lists are non-empty and of matching length
    assert len(docs1) == len(docs2), 'Document counts do not match!'
    assert len(docs1) != 0, 'Documents list1 is empty'
    assert len(docs2) != 0, 'Documents list2 is empty'
    assert lang == 'cn' or lang == 'en', 'Unsupported language setting'

    # change setting according to text language
    if lang == 'cn':
        model_path = '../model/cn.cbow.bin'
        stopwords_path = '../data/chinese_stopwords.txt'
        preprocess_data = preprocess_data_cn
    elif lang == 'en':
        model_path = '../model/GoogleNews-vectors-negative300.bin'
        stopwords_path = '../data/english_stopwords.txt'
        preprocess_data = preprocess_data_en

    # load word2vec model
    LogInfo('Load word2vec model...')
    model = KeyedVectors.load_word2vec_format(model_path,
                                              binary=True,
                                              unicode_errors='ignore')
    # normalize vectors
    model.init_sims(replace=True)

    # preprocess data
    stopwords = [
        w.strip() for w in codecs.open(stopwords_path, 'r',
                                       encoding='utf-8').readlines()
    ]
    sims = []
    LogInfo('Calculating similarity...')
    for i in range(len(docs1)):
        p1 = preprocess_data(stopwords, docs1[i])
        p2 = preprocess_data(stopwords, docs2[i])
        # calculate wmd similarity
        instance = WmdSimilarity(p1, model)
        sim = instance.get_similarities(p2)
        sims.append(sim[0])

    return sims
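A hedged usage sketch for wmd_similarity(); the sentences are toy data, and the model and stopword paths hard-coded inside the function are assumed to exist on disk:

docs1 = ['The cat sits on the mat.', 'Stock markets fell sharply today.']
docs2 = ['A cat is lying on a rug.', 'Oil prices rose this morning.']
sims = wmd_similarity('en', docs1, docs2)
for a, b, s in zip(docs1, docs2, sims):
    print(s, '|', a, '<->', b)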
Example #4
def wmd_similarity1(doc1, doc2):
    '''
    Input:
        doc1: Chinese document string used to build the single-document index
        doc2: Chinese document string to compare against doc1
    Output:
        WMD similarity of doc2 against doc1
    '''

    # load word2vec model (same Chinese settings as wmd_similarity above)
    model_path = '../model/cn.cbow.bin'
    stopwords_path = '../data/chinese_stopwords.txt'
    model = KeyedVectors.load_word2vec_format(model_path,
                                              binary=True,
                                              unicode_errors='ignore')
    # normalize vectors
    model.init_sims(replace=True)

    # preprocess data
    stopwords = [
        w.strip() for w in codecs.open(stopwords_path, 'r',
                                       encoding='utf-8').readlines()
    ]
    LogInfo('Calculating similarity...')
    p1 = preprocess_data_cn(stopwords, doc1)
    p2 = preprocess_data_cn(stopwords, doc2)

    # calculate wmd similarity
    instance = WmdSimilarity([p1], model, num_best=10)
    sim = instance[p2]

    return sim
Example #5
    def __init__(self, corpus, model, num_best):

        self.corpus = corpus
        self.num_best = num_best
        self.instance = WmdSimilarity([f.txt_pp for f in self.corpus],
                                      model,
                                      num_best=self.num_best)
Example #6
 def similarity(self, query, docs, size=10):
     wmd_inst = WmdSimilarity(docs,
                              self.word2vec,
                              num_best=size,
                              normalize_w2v_and_replace=False)
     sims = wmd_inst[query]
     return sims
Example #7
    def word_movers_distance_topn(self):
        model = KeyedVectors.load_word2vec_format(self.embedding_path,
                                                  binary=False)
        tokenized_document_corpus = CorpusDocumentIterator(self.corpus,
                                                           lemma=False,
                                                           lower=False)

        dictionary = corpora.Dictionary()
        bow_corpus = [
            dictionary.doc2bow(doc, allow_update=True)
            for doc in tokenized_document_corpus
        ]
        tf_idf_model = TfidfModel(bow_corpus)

        wmd_corpus = []
        doc_id_mapping = {
            doc_id: i
            for i, doc_id in enumerate(tokenized_document_corpus.doc_ids)
        }
        for doc in tf_idf_model[bow_corpus]:
            tuples = [(dictionary[word_id], sim) for word_id, sim in doc]
            tuples.sort(key=lambda x: x[1], reverse=True)  # keep the highest-weight (most relevant) words
            tuples = tuples[:self.top_n_words]
            relevant_words = [word for word, sim in tuples]
            wmd_corpus.append(relevant_words)

        similarities = WmdSimilarity(wmd_corpus,
                                     model.wv,
                                     num_best=self.top_n_docs)
        # print(similarities[wmd_corpus[0]])
        # print(similarities[wmd_corpus[doc_id_mapping["cb_0"]]])
        self.similarities = similarities
        self.wmd_corpus = wmd_corpus
        self.doc_id_mapping = doc_id_mapping
        self.reverse_doc_id_mapping = tokenized_document_corpus.doc_ids
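A hedged usage sketch for the index stored above; ranker stands for an instance of the class after word_movers_distance_topn() has run, and "cb_0" is the document id already referenced in the commented-out prints:

query_doc = ranker.wmd_corpus[ranker.doc_id_mapping["cb_0"]]
for corpus_idx, score in ranker.similarities[query_doc]:
    print(ranker.reverse_doc_id_mapping[corpus_idx], score)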
Example #8
def compute_similarity_with_candidate_sentences_using_wmd(query_app_id, steam_tokens=None, model=None,
                                                          candidates=None):
    if steam_tokens is None:
        steam_tokens = load_tokens()

    if model is None:
        model = Word2Vec.load(get_word_model_file_name())

    constrain_search = (candidates is not None)

    query = steam_tokens[query_app_id]

    if constrain_search:
        documents = list(steam_tokens[i] for i in candidates)
    else:
        # Caveat: the Word Mover algorithm is painfully slow! Please consider constraining the search to few candidates!
        documents = list(steam_tokens.values())

    instance = WmdSimilarity(documents, model.wv, num_best=10)

    similarity_scores_as_tuples = instance[query]

    similarity_scores = reformat_similarity_scores_for_wmd(similarity_scores_as_tuples, candidates)
    print_most_similar_sentences(similarity_scores)

    return similarity_scores
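A hedged call sketch; the Steam app ids are placeholders, and per the caveat above the search is constrained to a short candidate list to keep WMD tractable:

query_app_id = '570'                      # hypothetical query app id
candidates = ['440', '730', '303210']     # hypothetical candidate app ids
scores = compute_similarity_with_candidate_sentences_using_wmd(query_app_id,
                                                               candidates=candidates)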
Example #9
	def getWmdSimilarity(self, jd, resume):

		jd = "We are looking W2 Consultant only. Role Full stack UI Developer Location Mountain View CA Duration 3 Months Payroll team is working to enhance the customer setup and the ability to update employees payroll information to provide cleaner experience for the customers eliminating the need to call customer service. Responsibilities Designs codes tests debug and documents software and enhance existing components to ensure that software meets business needs. Contribute to the design and architecture to enable secure scalable and maintainable software. Apply technical expertise to resolve challenging programming and design problems. Front end UI design and development using JavaScript frameworks and HTML CSS and other Web technologies. Accurately estimate engineering work effort for engineering team. Build high quality code following unit testing and test driven development. Work cross functionally with various extended teams product management designers QA customer support and other business drivers to deliver product features and to address critical customer issues. Skills and Qualifications BS MS in Computer Science or equivalent work experience. 5 years of work experience developing scalable customer facing web and software applications. Strong experience leveraging technologies such as Java J2EE JavaScript HTML5 jQuery and related tools and web frameworks. 3 years of professional experience working with backbone.js or similar JavaScript frameworks is required. Experience in JSP JSF Struts Experience with XML JSON and developing REST services. Good understanding of SQL relational database preferably SQL Server Experienced in Agile software development and Scrum lifecycle practices."

		resume = " OZAN MANAV Istanbul Turkey www.ozanmanav.com +90 551 860 2015 Tranings Volunteer works - Microsoft Student Partner - Google Scholarship - Udemy Pluralsight Javascript Learning Path EDUCATION BACHELOR DEGREE ESKISEHIR OSMANGAZI UNIVERSITY BS in Computer engineering Sep 2012 June 2017 Responsibilities Works - I developed React Native and Native Mobile Applications. - QRcode Supported Membership System Android App Google Play- Elma Cafe Plus - IOT device management app based on logical values Google Play- Rim Control 4 AYTIM GROUP - TURKEY Aytim is the gaming textile company in Turkey providing services in all business lines. JUNIOR SOFTWARE ENGINEER Feb 2014 - Dec 2016 Responsibilities Works - I provided methodologies for object-oriented software development and efficient database design. - We have developed payment systems infrastructure together with the team. - I ve experienced UI testing frameworks like Selenium - I gave trainings to my team about Clean Code and TDD. From Robert C.Martin Books - I ve developed myself for secure software development. BDDK PCI Standards ARENA COMPUTER INC. or Payment Systems - TURKEY Arena is the leading provider of technology products and related supply chain management services in Turkey. Arena is characterised by its high level of innovation professional management and development strategies. Dec 2016 Jan 2018 SOFTWARE ENGINEER Im a Software Engineer familiar with a wide range of programming utilities and languages. Knowledgeable of backend and frontend development requirements. Able to handle any part of the process with ease. Collaborative team player with excellent technical abilities offering 4 years of related experience. JOB EXPERIENCESSOFTWARE ENGINEER Jan 2018 - Current ATP Ata Technology Platform - TURKEY ATP a leader in finance technologies addresses the needs of brokerage firms portfolio managers and insurance companies with comprehensive solutions and services. Its platforms handle a significant portion of the Istanbul Stock Exchanges trading volume. Responsibilities Works - Im supporting frontend mobile and web development and improvement process - Weve developed a dashboard with ReactJS and continuing maintenance with my team friends. - I ve developed react native screens in some parts of Native mobile projects. - In addition I developed mobile applications with React Native for Shiftdelete.net one of Turkey s largest tech news sites. [email protected] GraphQL AWS Docker TDD or BDD Agile or Scrum RESTFul APIs Node Webpack Git HTML5 CSS3 ES6 SOFTWARE ENGINEER Javascript React or React Native Redux CORE SKILLS PROFESSIONAL SUMMARY "

		resume = " Microsoft Word - Shalini Channappa.docx Shalini Channappa Front End Web Developer [email protected] Hard-working web developer with a flair for creating elegant solutions in the least amount of time. Passionate about building responsive websites mobile apps and interactive features that drive business growth and improve UX. Experience Front End Web Developer Cisco 07 or 2018 - present Develop and test new components for the Digital partner advisor DPA project using Cisco UI Angular. Experience in developing single page applications using Angular. Improvise existing components and usability of various areas of the application working closely with a Product manager. Work in an Agile Scrum methodology on fast-moving projects. Extensive experience in UI web applications using HTML5 CSS3 Javascript XML jQuery AJAX JSON Angular and integrating Restful API s. Worked on eliminating bootstrap one of the two UI libraries of the application in order to avoid bloat overwriting and conflicts. Also handled the aftermath of the breakdown of layout and components and stabilized the application with release readiness in one sprint. Upgraded DPA to the current version of Cisco UI which was six versions behind and 90 of the library being overridden by custom definitions. Freelance Web Developer 08 or 2016 - 05 or 2018 Clients Turbo Tax Gabes Rentelo GPA Saver Translated design teams UX wireframes and mockups into responsive interactive features using HTML CSS and JavaScript. Worked with agile team to migrate legacy company website to a Wordpress site. Redesign of Gabes Android mobile app which increased downloads by 18 in less than 6 months. Increased email signups 12 by creating new UI for website landing page in React. Created highly detailed and annotated architectural wireframes. Successfully submitted MVPS. Actively participated in slack channels daily standups UI or UX design process code reviews responsive design managing project using Github s project Kanban board interface documentation testing and the final product launch. Manager Risk Investigations Amazon.com 09 or 2012 - 08 or 2016 Created grease monkey scripts to improve manual investigation efficiency by 115 . Created a script to review investigation steps dynamically and enable mistake proofing to improve investigation quality and reduce decision defect. Conducted a six sigma yellow belt Kaizen event with business operations analytics and software development team to determine and build machine learning model and variable to reduce incoming volume by 45 and saved 7.5 MM. Created dashboard for Amazon.in category management team using ETL jobs. Web Developer Intern Hindustan Aeronautics Limited 01 or 2011 - 05 or 2011 Designed UX wireframes and mockups and translated into interactive features using HTML CSS and JavaScript. Involved in writing stored procedures queries triggers and views. Wrote SQL queries to interact with SQL Server database. Web Developer Intern E Surveying Softtech 08 or 2010 - 12 or 2010 Handled search engine optimization SEO for the company s website resulting in which the website managed to top the Google search in survey related software. Performed Manual Testing on newly launched software technical content writing for upcoming software releases and web content development. 
EDUCATION Texas A M University 08 or 2016 - 05 or 2018 Master Of Science Computer Science GPA 3.91 SKILLS HTML CSS SQL JavaScript UI or UX Design Angular React Native AWARDS - Above and beyond awards in Q1 and Q3 of 2015 from Amazon.com - Received 6 employee of the month awards from Amazon.com - Awarded as the best Quality auditor during my tenure as quality auditor - Best new hire trainee from a batch of twelve in Amazon.com - Recipient of Grow with Google Developer Challenge Scholarship "

		resume = " Microsoft Word - Raviteja Kondubhatla.docx Ravteja Kondubhatla Data Scientist [email protected] Summary With my 5 years of experience in coding with analytical programming using Python SQL and Hadoop Id like to plan design and implement database solutions and work cross-functionally to customize client needs. My passion is to develop web application back end components and offer support to the front-end developers. Experience Data Scientist Cuna Mutual Group Wisconsin USA Oct 2018 - Present Implemented discretization and binning data wrangling cleaning transforming merging and reshaping data frames using python libraries like Numpy Scikit Matplotlib and Pandas Developed a propensity score generator for targeting the prospective Credit Union members using Machine Learning algorithms using Python Data Analyst Python Development Samsung California USA May 2018-Jul 2018 Automated batch test evaluation that allows a smooth flow of data from distributed data systems to the local machines and involved in Unit testing and Integration testing of the code Created a text normalizer using NLP for Bixby modules and created a workflow using technologies such as GIT Gained experience in working with various Python Integrated Development Environments like IDLE PyCharm Atom Eclipse and Sublime Text Senior Data Analyst Beroe Inc Chennai India May 2012 Dec 2016 Increased revenue by 40 by targeting the most profitable set of customers for a campaign about sustainability by performing a logistic regression technique Designed a product that provides actionable recommendations by identifying best cost sourcing suppliers LCCS for P G by making 95 accurate price forecasts in 2014-15 using elasticity modelling Projects Quantitative Analytics- Credit scoring model for loan applicants - Built a model to identify customers who were likely to default on a loan after extensive data cleaning-missing value and transforming the data outlier treatment . The model used was logistic regression with variables like total transactions purchase volume etc. Predictive Analytics Hospital Ranking - Analyzed hospital data and determined rank of all the hospitals in the United States of America based on the number of patients treated doctor availability and successful operations using python Skills Python SQL Hadoop Education University of Texas at Dallas M.S.in Data Analytics GPA 3.5 Jan 2017- Jul 2018 BITS Pilani B.E. in Engineering GPA 3.5 Aug 2007- May 2012 "

		# jd = 'Python'
		# resume = "java"

		print("\n jd --- ", jd)
		print("\n resume --- ", resume)
		similarity = ''
		try:
			jd_token = self.word_token(self.cleanText(jd))
			jd_token = [tpl[0] for tpl in pos_tag(jd_token) if tpl[1] in ['NN','VB'] ]
			# jd_token = [tpl[0] for tpl in pos_tag(jd_token) if tpl[1] in ['NN', 'NNS','NNP', 'NNPS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'] ]
			
			print("\n jd_token = ",jd_token)
			wmd_corpus = self.prepareData(resume)
			instance_wmd = WmdSimilarity(wmd_corpus, self.w2v_model)
			similarity = instance_wmd[jd_token][0]
			print("\n wmd sims --- ", similarity)
		except Exception as e:
			print("\n Error in getWmdSimilarity --- ", e, "\n", traceback.format_exc())
			pass
		return similarity
Example #10
def discard_non_relevant(jobs_list, stop_words, w2v_model):
    '''
    :param jobs_list: list of jobs the candidate applied to in the past; each item in the list is a dictionary of field names with strings as values
    :param stop_words: words that will be discarded from each job to reduce noise
    :param w2v_model: gensim word2vec model used to compute the Word Mover's similarity metric and find irrelevant jobs in the list
    '''
    dist_list = []
    jobs_corpus = []
    imp_fields = ["function", "title", "keywords"]
    for i in range(len(jobs_list)):
        bow1, _, _ = get_bow(jobs_df[jobs_df["id"] == jobs_list[i]],
                             imp_fields,
                             type="pos")
        bow1_stop = [w for w in bow1 if w not in stop_words]
        jobs_corpus.append(bow1_stop)
    instance = WmdSimilarity(jobs_corpus, w2v_model, num_best=None)
    for i in range(len(jobs_list)):
        similarity = instance[jobs_corpus[i]]
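        # average similarity to all other jobs, excluding the self-match (which contributes 1.0)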
        dist_list.extend([(sum(similarity) - 1) / (len(similarity) - 1)])
    new_jobs_list = [
        jobs_list[i] for i in range(len(jobs_list)) if dist_list[i] > 0.45
    ]

    #print(jobs_list)
    #print(dist_list)
    #print(new_jobs_list)
    #print(len(jobs_list),len(new_jobs_list))
    return new_jobs_list
Example #11
    def calculate_similarity_distance(self, sentence, corpus):
        # length_sentence = len(sentence.split(" "))
        # minimum_length_corpus_sentence = len(min(corpus, key=lambda a: len(a.split(" "))).split(" "))
        #
        # new_corpus = []
        # if minimum_length_corpus_sentence > length_sentence:
        #     for c in corpus:
        #         new_corpus.append(" ".join(c.split(" ")[-length_sentence:]))
        #     new_sentence = " ".join(sentence.split(" ")[-length_sentence:])
        #     corpus = new_corpus
        #     sentence = new_sentence
        # else:
        #     for c in corpus:
        #         new_corpus.append(" ".join(c.split(" ")[-minimum_length_corpus_sentence:]))
        #     new_sentence = " ".join(sentence.split(" ")[-minimum_length_corpus_sentence:])
        #     corpus = new_corpus
        #     sentence = new_sentence

        # if len(corpus) > 10:
        #     similarities = []
        #     similarity_query_response = WmdSimilarity(corpus, self.model, normalize_w2v_and_replace=False,
        #                                               num_best=10)
        #     similarity_query_answer = similarity_query_response[sentence]
        #     for i in range(10):
        #         similarities.append(similarity_query_answer[i][1])
        # else:
        #     similarity_query_response = WmdSimilarity(corpus, self.model, normalize_w2v_and_replace=False)
        #     similarities = similarity_query_response[sentence]

        similarity_query_response = WmdSimilarity(
            corpus, self.model, normalize_w2v_and_replace=False)
        similarities = similarity_query_response[sentence]

        return math_is_great_again.softmax(similarities)
Example #12
 def top_K_similar(self, query, corpus, K=10):
     wmd_inst = WmdSimilarity(corpus,
                              self.word2vec,
                              num_best=K,
                              normalize_w2v_and_replace=False)
     scores = wmd_inst[query]
     return scores
Example #13
    def score(self, key, tagged_gram, test_file_path, wv_dict, **kwargs):
        # tagged_gram: [['esafetyworld', 'comp'], ['inc', 'end']]
        # find ngrams in test file similar to example
        similar_contexts = []
        example_contexts = self.example_tagged_words_contexts_dict[
            tagged_words_to_str(tagged_gram)]
        test_contexts = list(self.context_sized_test_wv_dict.keys())
        # save_path = os.path.join(self.wmd_save_dir, ft.file_name_from_path(test_file_path))
        # try:
        #     wmd_instance = WmdSimilarity.load(save_path)
        # except FileNotFoundError:
        #     file = open(save_path, 'x')
        #     file.close()
        wmd_instance = WmdSimilarity(test_contexts,
                                     self.context_vec_model,
                                     num_best=1)

        for example_context in example_contexts:
            sims = wmd_instance[example_context]
            similar_contexts.append(test_contexts[sims[0][0]])
        # wmd_instance.save(save_path)
        logging.info('similar contexts:')
        print(similar_contexts)
        # similar_contexts = set()
        context_wv_dict = util.subset_dict_by_list2(wv_dict, similar_contexts)
        logging.info('context_wv_dict:')
        logging.info(len(context_wv_dict))
        # print(context_wv_dict)
        gram = util.sentence_from_tagged_ngram(tagged_gram)
        return OneShotTestDoc2Vec.score(self, key, gram, test_file_path,
                                        context_wv_dict)
Example #14
 def init(self):
     if not self.bm25_instance:
         self.bm25_instance = BM25(corpus=self.corpus)
     if not self.wmd_instance:
         from gensim.similarities import WmdSimilarity
         self.embedding_type = EmbType.W2V
         self.load_model()
         self.wmd_instance = WmdSimilarity(corpus=self.corpus, w2v_model=self.model.w2v, num_best=self.num_best)
Example #15
def letsquery(testfile, model, corpus, n_output, location, dataframe, n_varmatches, restricted_df, threshold): 
    #testfile is the file whose variables we want to understand, model is the
    #pretrained word2vec or FastText model, corpus is the set of labels or variables we would like to query from,
    #n_varmatches is the number of high-scoring variables to which we limit ourselves for each mystery variable.
    #location is either 'column' or 'row'.
    #depending on whether testfile has variables as columns or rows.
    #dataframe is master dataframe containing file names, labels, and paths.
    #restricted data frame should be a dataframe that only contains variable from econ files, say, if you only
    #want to test those
    
    
    start = time()
    match_files = []
    preintersection = []
    word_list = []
    #pull variables from data set we are investigating
    test_vars = get_vars(testfile, location)
    #print('Attempting to identify the following variables:', test_vars)
    #initiate instance of searching corpus of labels for the test variables
    instance = WmdSimilarity(corpus, model, num_best=n_varmatches)
    
    #find all files that contain the matched variables
    for var in tqdm(test_vars):
        query = split_nonempty(var)
        response = instance[query]
        word_list.append(response[0][0])
        for i in range(n_varmatches):
            if response[i][1] >= threshold:
                slc = restricted_df[restricted_df['spaced_label']  == respace(corpus[response[i][0]])]
                names = slc['path'].unique().tolist()
                match_files += names
            #preintersection.append(set(names))
            #intersection = set.intersection(*preintersection)
    print('I have found ', len(set(match_files)), ' distinct prospective parent files among all variables.')
    #print('I have found ', len(intersection), ' prospective parent files that are common to all variable names.')
    
    #these counters see how often each files appeared in the list of matched files, which
    #includes ALL test variables and ALL prospective match variables for EACH of those test variables
    c = Counter(match_files)
    totals = c.most_common(None)
    mc = totals[:n_output]
    
    #can also split according to which directory the files live in, so that we don't have to look
    #at really long strings
    directories = [directory(x) for x in match_files]
    d = Counter(directories)
    dtotals = d.most_common(None)
    if len(dtotals) >= n_output:
        mcd = dtotals[:n_output]
    else:
        mcd = dtotals
    
    print('The', n_output ,'most common file names are', mc)
    print('The', n_output ,'most common directory names are', mcd)
    print('Total time to process query:', time()-start)
    
    return set(match_files), totals, dtotals, word_list
Example #16
    def getWmdSimilarity(self, query):

        print("\n query ==>> ", query)
        query_token = self.word_token(self.cleanText(query))
        instance_wmd = WmdSimilarity(self.wmd_corpus, self.w2v_data)
        wmd_sims = instance_wmd[query_token]
        wmd_sims = sorted(enumerate(wmd_sims), key=lambda item: -item[1])
        similar_docs = [(s, self.documents[i]) for i, s in wmd_sims]
        similar_docs = similar_docs[:5]
        print("\nsimilar docs => ", similar_docs)
Example #17
def create_wmd_instances_process(all_knowledge, wmd_model, num_results,
                                 wmd_instance_count, in_q, out_q):
    """
    This function updates the WMD instances used in the WMDLogicModule.
    NOTE: This runs in its own process in order not to block, so that
    answers can be returned faster...
    """
    logger = logging.getLogger(os.path.basename(sys.argv[0]))
    logger.info('   |    +----> CHILD: RE-Creating in separate process...')
    sys.stdout.flush()
    wmd_instances = []
    wmd_corpus = []
    for tokens in all_knowledge:
        wmd_corpus.append(tokens)

    if wmd_instance_count > len(wmd_corpus):
        wmd_instance_count = len(wmd_corpus)
    chunk_size = int(len(wmd_corpus) / wmd_instance_count)
    for i in range(0, wmd_instance_count):
        logger.info(' Instance %d..' % i)
        sys.stdout.flush()
        if i == wmd_instance_count - 1:
            wmd_instance = WmdSimilarity(wmd_corpus[i * chunk_size:],
                                         wmd_model, num_results)
        else:
            wmd_instance = WmdSimilarity(
                wmd_corpus[i * chunk_size:(i + 1) * chunk_size], wmd_model,
                num_results)
        wmd_instances.append(wmd_instance)
    logger.info('   |    +----> CHILD: Adding chunksize to out_q')
    out_q.put(chunk_size)
    logger.info('   |    +----> CHILD: Adding instances to out_q')
    out_q.put(wmd_instances)
    out_q.close()
    logger.info(
        '   |    +----> CHILD: Waiting for data to be flushed to my PARENT...')
    done = in_q.get()
    logger.info(
        '   |    +----> CHILD: I have done my job (Parent is happy), I am going away now...'
    )
    os._exit(0)
    return True
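A hedged sketch of the parent side driving create_wmd_instances_process(); all_knowledge and wmd_model are assumed to be prepared elsewhere, and the queues mirror the in_q/out_q arguments from the child's point of view:

from multiprocessing import Process, Queue

to_child, from_child = Queue(), Queue()
p = Process(target=create_wmd_instances_process,
            args=(all_knowledge, wmd_model, 10, 4, to_child, from_child))
p.start()
chunk_size = from_child.get()      # first message: the chunk size
wmd_instances = from_child.get()   # second message: the list of WmdSimilarity instances
to_child.put(True)                 # signal the child that its data arrived so it can exit
p.join()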
Example #18
    def compare(self, a, b):
        """
        Returns the Word Mover's similarity between two token lists a and b.

        Parameters
        ----------
        a : The first list of strings to be compared to the second list.
        b : The second list of strings to be compared to the first list.
        """
        index = WmdSimilarity([a], self.model)
        return index[b]
Example #19
    def train(self, lang):
        st = time.time()
        sentences = self.read_data()

        with open("training_jsons/training_sentences_nb_happybytes_faq.json",
                  "w+") as fs:
            fs.write(json.dumps({"sentences": sentences}, indent=4))

        sentences = [self.cleaning_pipeline(sent, lang) for sent in sentences]

        sentences = [word_tokenize(sent) for sent in sentences]

        self.load_word2vec(lang)

        train_time = time.time()
        instance_wmd = WmdSimilarity(sentences, self.model)
        instance_wmd.save("models/fasttext_wmd_nb_happybytes_faq.model")
        del self.model
        print("\n wmd training time --- ", time.time() - train_time)
        print("\n total execution time --- ", time.time() - st)
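The persisted index can later be reloaded without retraining; a hedged sketch, where clean_and_tokenize is a hypothetical stand-in for the cleaning pipeline and word_tokenize steps used during training:

loaded_wmd = WmdSimilarity.load("models/fasttext_wmd_nb_happybytes_faq.model")
query_tokens = clean_and_tokenize("how do I check my data balance")  # hypothetical helper
print(loaded_wmd[query_tokens])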
Example #20
def cocomo():
    if request.method == "POST":
        try:
            sentence = request.form['query']
        except:
            errors.append("Can't read!")
            return render_template('cocomo.html', errors=errors)
        if sentence:
            df = pd.read_excel('Evaluation.xlsx')
            df_test = pd.read_excel('Test.xlsx')
            test = df_test['Title'].to_list()
            df['content'] = df[['Title', 'Abstract',
                                'Keywords']].apply(lambda x: ' '.join(x),
                                                   axis=1)
            for x in range(128):
                if df.Title.iloc[x] in test:
                    df.Label.iloc[x] = 'Yes'
                else:
                    df.Label.iloc[x] = 'No'
            cocomo_df = df[df['Label'] == 'Yes']
            cocomo_content = cocomo_df['content'].to_list()
            cocomo_corpus = list()
            cocomo_lemma = list()
            for x in cocomo_content:
                cocomo_corpus.append(preprocess(x))
            for x in cocomo_corpus:
                cocomo_lemma.append(lemmatize(x))
            model_ft = ft(cocomo_lemma,
                          sg=1,
                          workers=3,
                          iter=5,
                          size=100,
                          min_count=5,
                          window=2)
            instance_ft = WmdSimilarity(cocomo_lemma, model_ft, num_best=105)
            query = sentence
            query = preprocess(query)
            query = lemmatize(query)
            sims_ft = instance_ft[query]
            wmd = list()
            tp = 0
            fn = 0
            for i in range(105):
                if round(sims_ft[i][1], 2) >= 0.85:
                    wmd.append(cocomo_df.Title.iloc[sims_ft[i][0]])
                    tp = tp + 1
                else:
                    fn = fn + 1
            df_tp = pd.DataFrame()
            for metadata in wmd:
                df_tp = df_tp.append(df[df['Title'] == metadata])
            df_tp = df_tp.drop(['Abstract', 'Keywords', 'Label', 'content'],
                               axis=1)
    return render_template('cocomo.html', tables=[df_tp.to_html()])
Example #21
def getMostSimilarWMD(model, corpus, target, nTop):
    """
    Using the word2vec "model", find in the corpus the nTop most similar
    documents to target.

    NOTE: WmdSimilarity() provide the "negative" of wmdistance(), i.e.:
    sim(d1,d2) = 1/(1+wmdistance(d1,d2)).

    See: https://markroxor.github.io/gensim/static/notebooks/WMD_tutorial.html
    """

    instance = WmdSimilarity(corpus, model, num_best=nTop)
    instance.num_best = nTop
    sims = instance[target]
    #  print('Query:', target)
    #  print("="*80)
    #  for i in range(nTop):
    #      print( 'sim = %.4f' % sims[i][1])
    #      print(corpus[sims[i][0]])
    #      print("="*80)

    return sims
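A hedged sanity check of the sim(d1, d2) = 1/(1+wmdistance(d1, d2)) relationship quoted in the docstring; model is assumed to be a trained gensim Word2Vec model and the corpus is toy data:

corpus = [['machine', 'learning'], ['deep', 'neural', 'network'], ['word', 'movers', 'distance']]
query = ['machine', 'learning', 'methods']
for doc_idx, sim in getMostSimilarWMD(model, corpus, query, nTop=2):
    dist = model.wv.wmdistance(query, corpus[doc_idx])
    print(doc_idx, sim, 1.0 / (1.0 + dist))  # the last two numbers should agree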
Example #22
    def main(self):

        print("Recommendation using Embeddings-Words")
        # Reading cvs data
        cvs = pd.read_csv(self.dataCvsFile)
        cvs = preprocessingCvsWords(cvs, self.out)
        #cvs.iloc[0]["id"]*10

        # Reading vagas
        vagas_ti = pd.read_csv(self.dataPrepFile)
        vagas_skills, vagas_ids = preprocessingJobsWords(vagas_ti, self.out)
        num_best = 10

        # Loading model
        model_skill_cbow = gsm.Word2Vec.load(
            self.out + "wordEmbeddings/ti_skill_w2v_cbow_200.model")
        model_skill_cbow.init_sims(replace=True)

        model_skill_skg = gsm.Word2Vec.load(
            self.out + "wordEmbeddings/ti_skill_w2v_skg_200.model")
        model_skill_skg.init_sims(replace=True)

        start = time()
        instance_cbow = WmdSimilarity(
            vagas_skills, model_skill_cbow, num_best=10
        )  # Using similarity framework for Word Mover's Distance (WMD)
        self.recommendation(cvs, vagas_ti, vagas_ids, num_best, instance_cbow,
                            "cbow")
        print("Time: %.4f" % (time() - start))

        start = time()
        instance_skg = WmdSimilarity(
            vagas_skills, model_skill_skg, num_best=10
        )  # Using similarity framework for Word Mover's Distance (WMD)
        self.recommendation(cvs, vagas_ti, vagas_ids, num_best, instance_skg,
                            "skg")
        print("Time: %.4f" % (time() - start))

        print("Recommendation using Embeddings-Words done!")
Example #23
def pm():
    if request.method == "POST":
        try:
            sentence_pm = request.form['query_pm']
        except:
            errors.append("Can't read!")
            return render_template('pm.html', errors=errors)
        if sentence_pm:
            df_pm = pd.read_excel(
                'Process Mining.xlsx',
                names=["Title", "Abstract", "Keywords", "Source", "Label"])
            df_pm['content'] = df_pm[['Title', 'Abstract',
                                      'Keywords']].apply(lambda x: ' '.join(x),
                                                         axis=1)
            pm = df_pm[df_pm['Label'] == 'Yes']
            del pm['Label']
            content_pm = pm['content'].to_list()
            pm_corpus = list()
            pm_lemma = list()
            for x in content_pm:
                pm_corpus.append(preprocess(x))
            for x in pm_corpus:
                pm_lemma.append(lemmatize(x))
            model_ft_pm = ft(pm_lemma,
                             sg=1,
                             workers=3,
                             iter=5,
                             size=100,
                             min_count=5,
                             window=2)
            instance_ft_pm = WmdSimilarity(pm_lemma, model_ft_pm, num_best=105)
            query_pm = sentence_pm
            query_pm = preprocess(query_pm)
            query_pm = lemmatize(query_pm)
            sims_ft_pm = instance_ft_pm[query_pm]
            wmd = list()
            tp = 0
            fn = 0
            for i in range(len(pm_lemma)):
                if round(sims_ft_pm[i][1], 2) >= 0.75:
                    wmd.append(pm.Title.iloc[sims_ft_pm[i][0]])
                    tp = tp + 1
                else:
                    fn = fn + 1
            df_tp = pd.DataFrame()
            for metadata in wmd:
                df_tp = df_tp.append(pm[pm['Title'] == metadata])
            df_tp = df_tp.drop(['Abstract', 'Keywords', 'content'], axis=1)
    return render_template('pm.html', tables=[df_tp.to_html()])
Example #24
def ss():
    if request.method == "POST":
        try:
            sentence_ss = request.form['query_ss']
        except:
            errors.append("Can't read!")
            return render_template('ss.html', errors=errors)
        if sentence_ss:
            df_ss = pd.read_excel(
                'Semantic Search.xlsx',
                names=["Title", "Abstract", "Keywords", "Source", "Label"])
            df_ss['content'] = df_ss[['Title', 'Abstract',
                                      'Keywords']].apply(lambda x: ' '.join(x),
                                                         axis=1)
            ss = df_ss[df_ss['Label'] == 'Yes']
            del ss['Label']
            content_ss = ss['content'].to_list()
            ss_corpus = list()
            ss_lemma = list()
            for x in content_ss:
                ss_corpus.append(preprocess(x))
            for x in ss_corpus:
                ss_lemma.append(lemmatize(x))
            model_ft_ss = ft(ss_lemma,
                             sg=1,
                             workers=3,
                             iter=5,
                             size=100,
                             min_count=5,
                             window=2)
            instance_ft_ss = WmdSimilarity(ss_lemma, model_ft_ss, num_best=105)
            query_ss = sentence_ss
            query_ss = preprocess(query_ss)
            query_ss = lemmatize(query_ss)
            sims_ft_ss = instance_ft_ss[query_ss]
            wmd = list()
            tp = 0
            fn = 0
            for i in range(len(ss_lemma)):
                if round(sims_ft_ss[i][1], 2) >= 0.75:
                    wmd.append(ss.Title.iloc[sims_ft_ss[i][0]])
                    tp = tp + 1
                else:
                    fn = fn + 1
            df_tp = pd.DataFrame()
            for metadata in wmd:
                df_tp = df_tp.append(ss[ss['Title'] == metadata])
            df_tp = df_tp.drop(['Abstract', 'Keywords', 'content'], axis=1)
    return render_template('ss.html', tables=[df_tp.to_html()])
Example #25
def mr():
    if request.method == "POST":
        try:
            sentence_mr = request.form['query_mr']
        except:
            errors.append("Can't read!")
            return render_template('mr.html', errors=errors)
        if sentence_mr:
            df_mr = pd.read_excel(
                'Mixed Reality.xlsx',
                names=["Title", "Abstract", "Keywords", "Source", "Label"])
            df_mr['content'] = df_mr[['Title', 'Abstract',
                                      'Keywords']].apply(lambda x: ' '.join(x),
                                                         axis=1)
            mr = df_mr[df_mr['Label'] == 'Yes']
            del mr['Label']
            content_mr = mr['content'].to_list()
            mr_corpus = list()
            mr_lemma = list()
            for x in content_mr:
                mr_corpus.append(preprocess(x))
            for x in mr_corpus:
                mr_lemma.append(lemmatize(x))
            model_ft_mr = ft(mr_lemma,
                             sg=1,
                             workers=3,
                             iter=5,
                             size=100,
                             min_count=5,
                             window=2)
            instance_ft_mr = WmdSimilarity(mr_lemma, model_ft_mr, num_best=105)
            query_mr = sentence_mr
            query_mr = preprocess(query_mr)
            query_mr = lemmatize(query_mr)
            sims_ft_mr = instance_ft_mr[query_mr]
            wmd = list()
            tp = 0
            fn = 0
            for i in range(len(mr_lemma)):
                if round(sims_ft_mr[i][1], 2) >= 0.77:
                    wmd.append(mr.Title.iloc[sims_ft_mr[i][0]])
                    tp = tp + 1
                else:
                    fn = fn + 1
            df_tp = pd.DataFrame()
            for metadata in wmd:
                df_tp = df_tp.append(mr[mr['Title'] == metadata])
            df_tp = df_tp.drop(['Abstract', 'Keywords', 'content'], axis=1)
    return render_template('mr.html', tables=[df_tp.to_html()])
Example #26
def match():
    model = 'data/m2v.mod'
    model_w2v = Word2Vec.load(model)
    sentences = list(LineSentence('data/cut_word.txt'))
    num_best = len(sentences)
    instance = WmdSimilarity(sentences, model_w2v, num_best=num_best)
    # with open("data/weibo.txt","r",encoding='utf-8') as f:
    #         context = f.readline()
    #         while context:
    #             temp.append(context.split("\t"))
    #             context = f.readline()
    #             # if len(temp) >= 60178:
    #             #     break
    # random.shuffle(temp)
    for item in sentences:
        name_list.append(item[0].replace(":", ""))
        poem_list.append(item[0:])
    count = 0
    with open("data/input_data.csv", "a", encoding='utf-8') as f:
        for i in range(num_best):
            if i > num_best: break
            sims = instance[poem_list[i]]
            index1 = name_list[i]
            # poem_dict[(index1,index2)] = 1-sims[j][1]
            sim_name = []
            for j in range(len(sims)):
                sim_name.append(name_list[sims[j][0]])
            for j in range(len(sims)):
                print(sim_name)
                index2 = name_list[j]
                value = sims[sim_name.index(index2)][1]
                value = round(value, 8)
                key_n = (index1, index2)
                key_c = (index2, index1)
                if key_n in poem_match_dict:
                    continue
                poem_match_dict[key_n] = value
                poem_com_dict[key_c] = value
                f.write(str(index1) + "," + str(index2) + "," + str(1 - value))
                f.write("\n")
                count += 1
    print(count)
    cnt = 0
    for k, v in poem_match_dict.items():
        if v == poem_com_dict[k]:
            cnt += 1
        else:
            print("error occur")
    print(cnt)
Example #27
    def train(self):
        np.random.seed(2018)

        self.make_word_list()

        self.model = gensim.models.Word2Vec(self.full_list,
                                            min_count=1,
                                            size=300,
                                            workers=4)

        # normalise vectors
        self.model.init_sims(replace=True)
        self.instance = WmdSimilarity(self.full_list,
                                      self.model,
                                      num_best=self.num_best)
Example #28
def wordCbow(sourceTexts, source_id, len_source, targetTexts, target_id,
             len_target, filename):
    """Read source and target artefacts and compute word move distance similarity for each pair of artefacts.
  Args:
    sourceTexts: a list of source artefacts tokenized with stopword removed;
    source_id: a list of source artefacts ids;
    len_source: number of source artefacts
    targetTexts: a list of target artefacts tokenized with stopword removed;
    target_id: a list of target artefacts ids;
    len_target: number of target artefacts
    filename: file where the ir model result are saved.
  Returns:
    None.
    """
    allwords = []
    for i in sourceTexts:
        allwords.append(i)
    for j in targetTexts:
        allwords.append(j)

    if not os.path.exists('helpers/GoogleNews-vectors-negative300.bin.gz'):
        raise ValueError(
            "SKIP: You need to download the google news model and put it on helpers directory"
        )

    model = KeyedVectors.load_word2vec_format(
        'helpers/GoogleNews-vectors-negative300.bin.gz', binary=True)
    instance = WmdSimilarity(sourceTexts, model)

    #creation of the csv file
    with open(filename, 'w') as csvfile:
        writer = csv.DictWriter(csvfile,
                                fieldnames=("Artifact1", "Artifact2",
                                            "probability"))
        writer.writeheader()
        # in each row # add requirements names, model name , and value
        for i in range(0, len_source):
            for j in range(0, len_target):
                print(i)
                sim = model.wmdistance(sourceTexts[i], targetTexts[j])
                writer.writerow({
                    'Artifact1': str("{0}".format(source_id[i])),
                    'Artifact2': str("{0}".format(target_id[j])),
                    'probability': str("{0}".format(sim))
                })

    print("similarity matrix build")
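A hedged call sketch for wordCbow(); the artefact token lists and ids are toy placeholders, the output file name is hypothetical, and the Google News vectors must already be present under helpers/ as the function requires:

sourceTexts = [['user', 'login', 'page'], ['export', 'report', 'pdf']]
targetTexts = [['authenticate', 'user', 'credentials'], ['generate', 'pdf', 'document']]
source_id = ['SRC-1', 'SRC-2']
target_id = ['TGT-1', 'TGT-2']
wordCbow(sourceTexts, source_id, len(sourceTexts),
         targetTexts, target_id, len(targetTexts),
         'wmd_trace_matrix.csv')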
Example #29
 def get_wmd_similarity(cls, doc, corpus, limit_number = 30):
     myself = sys._getframe().f_code.co_name
     try:
         tokenize_func = NLPUtil.tokenize_via_jieba
         corpus = list(map(tokenize_func, corpus))  # materialize so len() works
         corpus_size = len(corpus)
         wmd_inst = WmdSimilarity(corpus, 
                                  cls._word2vec,
                                  num_best = limit_number, 
                                  normalize_w2v_and_replace = False)
         doc_tokens = tokenize_func(doc)
         similar_items = wmd_inst[doc_tokens] if doc_tokens else []
         return similar_items
     except Exception as e:
         logger.get().warn('%s failed, doc=%s, limit_number=%d',
             myself, doc, limit_number)
         raise 
Example #30
    def GetSimilarity(self, sentences, num_best=5):
        """
        :param sentences: the question entered by the user
        :param num_best: the number of similar questions to retrieve
        :return:
        """
        self.num_best = num_best
        start = time()

        # Initialize the WmdSimilarity instance; request one extra match so the original question can be dropped if it is returned
        instance = WmdSimilarity(self.content, self.w2v_model, num_best=num_best+1)
        # Tokenize the input sentence and remove stopwords
        split_sent = self.split_word(sentences)
        # Build result part 1
        results_1 = {'title':sentences, 'split_title': split_sent.split()}
        # if self.verbose:
        #     print("result:", result)
        sims = instance[split_sent]  # a list like [(matched question index, similarity)]
        # Drop a match identical to the original question; the original question should have the highest similarity, otherwise the model is off
        # max_sim_index = -1
        # max_sim = 0
        # for i, sim in enumerate(sims):
        #     if sim[2] > max_sim:
        #         max_sim_index = i
        #         max_sim = sim[1]

        # The highest-similarity match is placed at index 0 of sims
        top_sim_num = sims[0][0]  # index of the most similar question
        self.titles[top_sim_num] = ''.join(self.titles[top_sim_num].split())
        if sentences == self.titles[top_sim_num]:
            sims.remove(sims[0])
        else:
            sims = sims[:-1]
        results_2 = []
        # Build result part 2
        for i, sim in enumerate(sims):
            question_num = sim[0] # index of the matched question
            question = self.titles[question_num]
            each_results_2 = {'index':str(question_num), 'similarity':str(sim[1]), 'title':question, 'confidence':None}
            results_2.append(each_results_2)
        # Combine result 1 and result 2
        results = {'result1': results_1, 'result2': results_2}
        # if self.verbose:
        print('Cell took %.2f seconds to run.' % (time() - start))
        return results