Example #1
def PreprocessData():
    # Initialize the parsed article container to None
    pubmedarticlelists = None

    # Create FileOperations object
    fo = FileOperations()

    # Create a Preprocessing object to parse the XML file
    p = Preprocessing()

    # If parsed file is present then load the file else parse the file
    if fo.exists(GV.parsedDataFile):
        pubmedarticlelists = p.LoadFile(GV.parsedDataFile)

    else:
        # Call the Parse method
        pubmedarticlelists, unsavedpmids = p.parse(GV.inputXmlFile)

        print(len(pubmedarticlelists))
        print(len(unsavedpmids))

        # Save the parsed data to a file
        fo.SaveFile(GV.parsedDataFile, pubmedarticlelists, mode='wb')
        fo.SaveFile(GV.unsavedPmidFile, unsavedpmids, mode='w')

        pubmedarticlelists = p.LoadFile(GV.parsedDataFile)

    del fo

    return pubmedarticlelists
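
Every example on this page relies on a project-specific FileOperations helper whose implementation is not shown. A minimal sketch, assuming pickle-backed persistence (suggested by the mode='wb' arguments and the pickle usage in Example #6); the class and method bodies below are assumptions for illustration, not the project's actual code.

import os
import pickle

class FileOperations:
    """Hypothetical sketch of the persistence helper used in these examples."""

    def exists(self, path):
        # True if a previously saved artifact is already on disk
        return os.path.isfile(path)

    def LoadFile(self, path):
        # Load a pickled object from disk
        with open(path, 'rb') as f:
            return pickle.load(f)

    def SaveFile(self, path, data, mode='wb'):
        # Binary modes pickle the object; text modes write one item per line
        if 'b' in mode:
            with open(path, mode) as f:
                pickle.dump(data, f)
        else:
            with open(path, mode) as f:
                f.write('\n'.join(str(item) for item in data))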
Example #2
def InitializeGlossary():

    # Create FileOperation object
    fo = FileOperations()

    # Initialize the two lists to None
    glossarylist, synonymlist = [None]*2

    if fo.exists(GV.healthGlossaryFilePath) and fo.exists(GV.synonymsFilePath):
        # Load both files from disk
        glossarylist = fo.LoadFile(GV.healthGlossaryFilePath)
        synonymlist = fo.LoadFile(GV.synonymsFilePath)

    else:
        # Get all the glossary terms
        glossarylist, synonymlist = GetGlossaryTerms()

        # Save the glossary terms
        fo.SaveFile(GV.healthGlossaryFilePath, glossarylist, mode='wb')

        # Save the synonyms
        fo.SaveFile(GV.synonymsFilePath, synonymlist, mode='wb')

    del fo

    return glossarylist, synonymlist
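
A hedged sketch of how the two helpers above might be wired together with the tokenizer below; the Abstract attribute and the overall flow are assumptions for illustration, not code from the project.

# Parse (or load) the articles and the glossary
pubmedarticlelists = PreprocessData()
glossarylist, synonymlist = InitializeGlossary()

# Collect the raw texts and their PMIDs (attribute name is assumed)
docs = [article.Abstract for article in pubmedarticlelists.values()]
ids = list(pubmedarticlelists.keys())

# Tokenize with the glossary entries treated as multi-word expressions (Example #3)
tokenizeddocs = TokenizeDocs(docs, glossarylist)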
Example #3
def TokenizeDocs(docs, glossarylist, filename=GV.tokenizedDocumentD2VFile):
    tokenizeddocs = []
    combineddocuments = []
    fo = FileOperations()
    # tokenizer = RegexpTokenizer(r'\w+')
    if fo.exists(filename):
        # Load the previously tokenized documents from disk
        combineddocuments = fo.LoadFile(filename)

    else:
        tokenizer = MWETokenizer(glossarylist)
        regtokenizer = RegexpTokenizer(r'\w+')
        for doc in tqdm(docs):
            sentences = sent_tokenize(doc)

            tmp = []
            for sentence in sentences:
                tokens = tokenizer.tokenize(regtokenizer.tokenize(sentence.lower()))
                token_lowercase = [x.lower() for x in tokens]
                tmp.append(token_lowercase)
            tokenizeddocs.append(tmp)

        # Flatten each document's sentences into a single token list
        for doc in tqdm(tokenizeddocs):
            tokdoc = []
            for sent in doc:
                tokdoc.extend(sent)
            combineddocuments.append(tokdoc)

        # Save the file
        fo.SaveFile(filename, combineddocuments, mode='wb')

    del fo

    return combineddocuments
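
TokenizeDocs splits each sentence into word tokens with RegexpTokenizer and then re-merges glossary multi-word expressions with NLTK's MWETokenizer, which joins matching token sequences with an underscore by default. A small standalone illustration (the glossary entries here are made up):

from nltk.tokenize import MWETokenizer, RegexpTokenizer

# Glossary entries are token sequences, e.g. ('blood', 'pressure')
tokenizer = MWETokenizer([('blood', 'pressure'), ('heart', 'attack')])
regtokenizer = RegexpTokenizer(r'\w+')

sentence = "High blood pressure raises the risk of a heart attack."
print(tokenizer.tokenize(regtokenizer.tokenize(sentence.lower())))
# ['high', 'blood_pressure', 'raises', 'the', 'risk', 'of', 'a', 'heart_attack']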
Example #4
def TokenizeDocsNew(docs, glossarylist, filename=GV.tokenizedDocumentD2VFile):
    tokenizeddocs = []
    combineddocuments = []
    fo = FileOperations()
    # tokenizer = RegexpTokenizer(r'\w+')
    if fo.exists(filename):
        # Load the previously tokenized documents from disk
        combineddocuments = fo.LoadFile(filename)

    else:
        tokenizer = MWETokenizer(glossarylist)
        regtokenizer = RegexpTokenizer(r'\w+')
        lmtzr = WordNetLemmatizer()
        stemmer = SnowballStemmer("english", ignore_stopwords=True)
        stop_words = stopwords.words('english')
        for doc in tqdm(docs):
            sentences = sent_tokenize(doc)

            tmp = []
            for sentence in sentences:
                # For each sentence in the sentences

                # Tokenize the sentence based on Regex and then using MWETokenizer
                tokens = tokenizer.tokenize(regtokenizer.tokenize(sentence.lower()))

                # Lower the case of all the tokens
                token_lowercase = [x.lower() for x in tokens]

                # Lemmatize the sentence: find the POS tags, then lemmatize each token
                tokens_lowercase_tagged = nltk.pos_tag(token_lowercase)
                lemmatized_sentence = [lmtzr.lemmatize(wrd, pos=get_wordnet_pos(tag)) for wrd, tag in tokens_lowercase_tagged]

                # Stem the sentence
                stemmed_sentence = [stemmer.stem(wrd) for wrd in lemmatized_sentence]

                # Remove the stop words
                processed_sentence = [word for word in stemmed_sentence if word not in stop_words]

                tmp.append(processed_sentence)
            tokenizeddocs.append(tmp)

        # Flatten each document's sentences into a single token list
        for doc in tqdm(tokenizeddocs):
            tokdoc = []
            for sent in doc:
                tokdoc.extend(sent)
            combineddocuments.append(tokdoc)

        # Save the file
        fo.SaveFile(filename, combineddocuments, mode='wb')

    del fo

    return combineddocuments
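
TokenizeDocsNew calls a get_wordnet_pos helper that is not shown in these examples. A common implementation maps the Penn Treebank tags returned by nltk.pos_tag to the WordNet constants expected by WordNetLemmatizer; the sketch below assumes that is what the project's helper does.

from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    # Map a Penn Treebank POS tag to a WordNet POS constant
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        # WordNetLemmatizer treats everything else as a noun
        return wordnet.NOUN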
Example #5
def SaveGlossary(glossarylist, synonymlist):
    fo = FileOperations()

    if fo.exists(GV.glossaryFilePath):
        return
    else:
        # Reload the glossary and synonym lists from disk (the passed-in arguments are overwritten)
        glossarylist, synonymlist = fo.LoadFile(GV.healthGlossaryFilePath), fo.LoadFile(GV.synonymsFilePath)

        # Deduplicate the synonym terms and merge them into the glossary
        synonymterm2 = set(tuple(term2) for term1, term2 in synonymlist)
        synonymterm2 = list(list(term) for term in synonymterm2)
        glossarylist += synonymterm2
        fo.SaveFile(GV.glossaryFilePath, glossarylist, mode='wb')
    del fo
Example #6
    def SaveSimilarDocuments(self, pubmedarticlelists, similardocfilename):
        # Get all the document vectors
        pdocs = self.doc2vec_model.docvecs.doctag_syn0  # [:pts]

        # Get all the pmids
        pmids = self.doc2vec_model.docvecs.offset2doctag  # [:pts]

        # Create the similar documents dictionary for each pmid
        similardocdict = {}
        import pickle
        for idx, pmid in tqdm(enumerate(pmids)):
            # Retrieve similarity scores against every other document in the corpus
            similardocdict[pmid] = self.doc2vec_model.docvecs.most_similar(
                pmid, topn=23752)
            # Put the document itself at the top of its own ranking
            similardocdict[pmid].insert(0, (pmid, '1.0'))

            # Periodically flush the partial results to disk to limit memory use
            if idx % 1000 == 0 or idx == len(pmids) - 1:
                with open('./saveddata/simdictpmid.pkl',
                          mode='a+b') as f:  # appending, not overwriting
                    pickle.dump(similardocdict, f)

                similardocdict = {}

        # Expected structure:
        # {
        #   'pmid1': {'Title': 'title1', 'Similar': [[id, 'title', score], [id, 'title', score], ...]},
        #   'pmid2': {'Title': 'title2', 'Similar': [[id, 'title', score], [id, 'title', score], ...]},
        #   ...
        # }

        # Build the per-PMID dictionary of similar articles with their titles
        # (this assumes similardocdict still holds every PMID in memory; with the
        # chunked flushing above it would first need to be reloaded from disk)
        similararticlesdict = {}
        for idx, pmid in tqdm(enumerate(pmids)):
            # Find current pmid title
            doctitle = pubmedarticlelists[pmid].ArticleTitle

            # Find similar documents pmids
            similardocpmids = similardocdict[pmid]

            similartitlescorelist = []

            # Iterate through all the pmids
            for id, score in similardocpmids:
                articletitle = pubmedarticlelists[id].ArticleTitle
                similartitlescorelist.append([id, articletitle, score])

            similararticlesdict[pmid] = {
                'Title': doctitle,
                'Similar': similartitlescorelist
            }

        # Save the similar documents
        fo = FileOperations()
        fo.SaveFile(similardocfilename, similararticlesdict)
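
Because SaveSimilarDocuments appends one pickle dump per chunk to the same file, reading it back requires repeated pickle.load calls until end of file and merging the partial dictionaries. A minimal sketch, assuming the path used above:

import pickle

def load_chunked_similarities(path='./saveddata/simdictpmid.pkl'):
    # Merge every pickled chunk that was appended to the file
    merged = {}
    with open(path, 'rb') as f:
        while True:
            try:
                merged.update(pickle.load(f))
            except EOFError:
                break
    return merged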
Example #7
    def CreateTaggedDocuments(self, tokenizeddocs, ids):
        taggeddocuments = None
        fo = FileOperations()

        if fo.exists(GV.taggedDocumentFile):
            taggeddocuments = fo.LoadFile(GV.taggedDocumentFile)
        else:
            taggeddocuments = [
                gensim.models.doc2vec.TaggedDocument(s, [ids[i]])
                for i, s in tqdm(enumerate(tokenizeddocs))
            ]
            fo.SaveFile(GV.taggedDocumentFile, taggeddocuments, mode='wb')

        del fo

        return taggeddocuments
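
The tagged documents produced by CreateTaggedDocuments are the usual input for training the gensim Doc2Vec model referenced as self.doc2vec_model in Example #6. A minimal training sketch; the hyperparameters are illustrative, not the project's settings.

import gensim

# taggeddocuments comes from CreateTaggedDocuments above
model = gensim.models.doc2vec.Doc2Vec(vector_size=100, min_count=2, epochs=40)
model.build_vocab(taggeddocuments)
model.train(taggeddocuments, total_examples=model.corpus_count, epochs=model.epochs)

# Document vectors can then be looked up by their PMID tag
# (docvecs matches the attribute used in these examples; newer gensim uses model.dv)
vector = model.docvecs['12345678']  # hypothetical PMID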