def Doc2Vec(docs, ids, glossarylist, pubmedarticlelists):

    # Tokenize all the docs
    tokenizeddocs = TokenizeDocs(docs, glossarylist, GV.tokenizedDocumentD2VFile)

    # Create Doc2Vec Model. Changing parameters will change the model name
    doc2vecmodel = Doc2VecModel(seed=1, num_features=200, min_word_count=2, context_size=3)
    taggeddocuments = doc2vecmodel.CreateTaggedDocuments(tokenizeddocs, ids)
    model = doc2vecmodel.Model(taggeddocuments, ids)

    # Get model filename
    modelfile = doc2vecmodel.GetModelFileName()

    # Load the saved model from disk
    model = doc2vecmodel.LoadModel(modelfile)

    # Save Similar Documents
    doc2vecmodel.SaveSimilarDocuments(pubmedarticlelists, GV.similarDocumentListFile)

    # Inspect the results: print a sample article's title and its most similar documents
    similardocdict = FileOperations().LoadFile(GV.similarDocumentListFile)
    print(similardocdict['29794785']['Title'])
    print('---------------------------------------')
    for simid, title, score in similardocdict['29794785']['Similar']:
        print(simid, ' : ', title)

    doc2vecmodel.Visualize('29794785')
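
# The Doc2VecModel wrapper used above is not part of this snippet. The sketch below
# is a hedged guess at what its training step does with gensim; the function name,
# the parameter mapping (num_features -> vector_size, context_size -> window,
# min_word_count -> min_count) and the model filename are assumptions, not the
# original implementation.
import gensim

def train_doc2vec_sketch(taggeddocuments, vector_size=200, window=3, min_count=2,
                         seed=1, modelfile='doc2vec_200_2_3.model'):
    model = gensim.models.doc2vec.Doc2Vec(
        vector_size=vector_size,   # num_features in the wrapper above
        window=window,             # context_size
        min_count=min_count,       # min_word_count
        seed=seed,
        workers=1,                 # a single worker keeps a seeded run reproducible
        epochs=20)
    model.build_vocab(taggeddocuments)
    model.train(taggeddocuments, total_examples=model.corpus_count, epochs=model.epochs)
    model.save(modelfile)
    return model
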
def TokenizeDocs(docs, glossarylist, filename=GV.tokenizedDocumentD2VFile):
    tokenizeddocs = []
    combineddocuments = []
    fo = FileOperations()
    # tokenizer = RegexpTokenizer(r'\w+')
    if fo.exists(filename):
        # Load the previously tokenized documents from disk
        combineddocuments = fo.LoadFile(filename)

    else:
        tokenizer = MWETokenizer(glossarylist)
        regtokenizer = RegexpTokenizer(r'\w+')
        for doc in tqdm(docs):
            sentences = sent_tokenize(doc)

            tmp = []
            for sentence in sentences:
                # Regex-tokenize the lower-cased sentence, then merge multi-word glossary terms
                tokens = tokenizer.tokenize(regtokenizer.tokenize(sentence.lower()))
                tmp.append(tokens)
            tokenizeddocs.append(tmp)

        for doc in tqdm(tokenizeddocs):
            # Flatten each document's sentences into a single token list
            tokdoc = []
            for sent in doc:
                tokdoc.extend(sent)
            combineddocuments.append(tokdoc)

        # Save the file
        fo.SaveFile(filename, combineddocuments, mode='wb')

    del fo

    return combineddocuments
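
# The RegexpTokenizer/MWETokenizer combination above is what keeps multi-word
# glossary terms together as single tokens. A small self-contained illustration;
# the glossary entries here are made up for the example:
def _mwe_tokenizer_example():
    from nltk.tokenize import MWETokenizer, RegexpTokenizer

    glossary = [('heart', 'attack'), ('blood', 'pressure')]
    mwe = MWETokenizer(glossary)
    regex = RegexpTokenizer(r'\w+')

    sentence = "High blood pressure raises the risk of a heart attack."
    tokens = mwe.tokenize(regex.tokenize(sentence.lower()))
    print(tokens)
    # ['high', 'blood_pressure', 'raises', 'the', 'risk', 'of', 'a', 'heart_attack']
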
def InitializeGlossary():

    # Create FileOperation object
    fo = FileOperations()

    # Initialize the two lists to None
    glossarylist, synonymlist = None, None

    if fo.exists(GV.healthGlossaryFilePath):
        # Load both lists from disk (they are saved together in the else branch below)
        glossarylist, synonymlist = fo.LoadFile(GV.healthGlossaryFilePath), fo.LoadFile(GV.synonymsFilePath)

    else:
        # Get all the glossary terms
        glossarylist, synonymlist = GetGlossaryTerms()

        # Save the glossary terms
        fo.SaveFile(GV.healthGlossaryFilePath, glossarylist, mode='wb')

        # Save the synonyms
        fo.SaveFile(GV.synonymsFilePath, synonymlist, mode='wb')

    del fo

    return glossarylist, synonymlist
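
# Every snippet in this collection leans on a FileOperations helper whose
# definition is not shown, and whose interface clearly differs between projects.
# The class below is only a minimal pickle-based sketch that would satisfy the
# exists/LoadFile/SaveFile calls used in the glossary and preprocessing code
# above; it is an assumption about intent, not the original class.
import os
import pickle

class FileOperationsSketch:
    def exists(self, filename):
        return os.path.isfile(filename)

    def LoadFile(self, filename, mode='rb'):
        # Binary mode is assumed to mean a pickled Python object
        with open(filename, mode) as f:
            return pickle.load(f) if 'b' in mode else f.read()

    def SaveFile(self, filename, data, mode='wb'):
        os.makedirs(os.path.dirname(filename) or '.', exist_ok=True)
        with open(filename, mode) as f:
            if 'b' in mode:
                pickle.dump(data, f)
            else:
                f.write('\n'.join(str(item) for item in data))
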
    def __init__(self, parent, app, size, title, style):
        wx.Frame.__init__(self, parent, id=-1, size=size, title=title, style=style)
        self.app = app
        self.Centre()
        self.initUI()
        self.fileoperation = FileOperations()
        self.Show()
def PreprocessData():
    # Create an object initialized to None
    pubmedarticlelists = None

    # Create FileOperations object
    fo = FileOperations()

    # parse the xml file
    p = Preprocessing()

    # If parsed file is present then load the file else parse the file
    if fo.exists(GV.parsedDataFile):
        pubmedarticlelists = p.LoadFile(GV.parsedDataFile)

    else:
        # Call the Parse method
        pubmedarticlelists, unsavedpmids = p.parse(GV.inputXmlFile)

        print(len(pubmedarticlelists))
        print(len(unsavedpmids))

        # Save the parsed data to a file
        fo.SaveFile(GV.parsedDataFile, pubmedarticlelists, mode='wb')
        fo.SaveFile(GV.unsavedPmidFile, unsavedpmids, mode='w')

        pubmedarticlelists = p.LoadFile(GV.parsedDataFile)

    del fo

    return pubmedarticlelists
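
# Preprocessing.parse is not shown either. The helper below is a rough sketch of
# the kind of PubMed XML extraction it presumably performs, assuming the standard
# PubmedArticle/PMID/ArticleTitle/AbstractText element names; the record type and
# the rule for skipping articles are illustrative, not the original logic.
import xml.etree.ElementTree as ET
from collections import namedtuple

PubmedArticleRecord = namedtuple('PubmedArticleRecord', ['ArticleTitle', 'Abstract'])

def parse_pubmed_xml_sketch(xmlfile):
    pubmedarticlelists, unsavedpmids = {}, []
    for article in ET.parse(xmlfile).getroot().iter('PubmedArticle'):
        pmid = article.findtext('.//PMID')
        title = article.findtext('.//ArticleTitle')
        abstract = ' '.join(t.text or '' for t in article.findall('.//AbstractText'))
        if pmid and title and abstract:
            pubmedarticlelists[pmid] = PubmedArticleRecord(title, abstract)
        elif pmid:
            # Track articles skipped for missing fields, mirroring unsavedpmids above
            unsavedpmids.append(pmid)
    return pubmedarticlelists, unsavedpmids
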
Example 6
	def __init__(self, tempReadFile, tempSaveDirectory, tempSaveFilename, interval):
		self.temperatureReadFile = tempReadFile
		self.temperatureSaveFile = os.path.join(tempSaveDirectory,tempSaveFilename)
		self.sampleInterval = interval
		self.isRecording = False
		self.recordingLoopActive = False
		self.threadsStarted = False
		self.nextTimer = None
		self.tempLogger = FileOperations(tempSaveDirectory, tempSaveFilename)
    def SaveSimilarDocuments(self, pubmedarticlelists, similardocfilename):
        # Get all the pmids (document tags) known to the model
        pmids = self.doc2vec_model.docvecs.offset2doctag

        # Create the similar documents dictionary for each pmid
        similardocdict = {}
        import pickle
        for idx, pmid in tqdm(enumerate(pmids)):
            # Rank all other documents by similarity to this pmid
            similardocdict[pmid] = self.doc2vec_model.docvecs.most_similar(
                pmid, topn=23752)
            # Prepend the document itself with a perfect similarity score
            similardocdict[pmid].insert(0, (pmid, 1.0))

            # Periodically append the partial dictionary to disk to bound memory
            # use; the final batch is flushed on the last pmid
            if (idx + 1) % 1000 == 0 or idx == len(pmids) - 1:
                with open('./saveddata/simdictpmid.pkl',
                          mode='a+b') as f:  # appending, not overwriting
                    pickle.dump(similardocdict, f)

                similardocdict = {}

        # Reload the incrementally saved partial dictionaries so the full mapping
        # is available for the title lookup below
        with open('./saveddata/simdictpmid.pkl', mode='rb') as f:
            while True:
                try:
                    similardocdict.update(pickle.load(f))
                except EOFError:
                    break

        # Expected structure of the saved dictionary:
        # { 'pmid1': {'Title': 'title', 'Similar': [[id, 'title', score], [id, 'title', score], ...]},
        #   'pmid2': {'Title': 'title', 'Similar': [[id, 'title', score], [id, 'title', score], ...]},
        #   ...
        # }

        similararticlesdict = {}
        for pmid in tqdm(pmids):
            # Find current pmid title
            doctitle = pubmedarticlelists[pmid].ArticleTitle

            # Find similar documents pmids
            similardocpmids = similardocdict[pmid]

            similartitlescorelist = []

            # Iterate through all the pmids
            for simid, score in similardocpmids:
                articletitle = pubmedarticlelists[simid].ArticleTitle
                similartitlescorelist.append([simid, articletitle, score])

            similararticlesdict[pmid] = {
                'Title': doctitle,
                'Similar': similartitlescorelist
            }

        # Save the similar documents
        fo = FileOperations()
        fo.SaveFile(similardocfilename, similararticlesdict)
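
# For reference, docvecs.most_similar returns a list of (tag, cosine_similarity)
# pairs, which is why each saved entry can be unpacked as "id, score" and why the
# query document itself is prepended with a score of 1.0. A short illustration,
# assuming 'doc2vec.model' is the path a trained model was saved to; the path and
# the example output are placeholders:
def _most_similar_example():
    import gensim

    model = gensim.models.doc2vec.Doc2Vec.load('doc2vec.model')
    print(model.docvecs.most_similar('29794785', topn=3))
    # e.g. [('28881136', 0.83), ('29200321', 0.79), ('27584321', 0.77)]
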
def SaveGlossary(glossarylist, synonymlist):
    fo = FileOperations()

    if fo.exists(GV.glossaryFilePath):
        return

    # Note: the arguments are ignored here; the previously saved copies are reloaded from disk
    glossarylist, synonymlist = fo.LoadFile(GV.healthGlossaryFilePath), fo.LoadFile(GV.synonymsFilePath)

    # Deduplicate the synonym terms and append them to the glossary before saving
    synonymterm2 = set(tuple(term2) for term1, term2 in synonymlist)
    synonymterm2 = [list(term) for term in synonymterm2]
    glossarylist += synonymterm2
    fo.SaveFile(GV.glossaryFilePath, glossarylist, mode='wb')

    del fo
def TokenizeDocsNew(docs, glossarylist, filename=GV.tokenizedDocumentD2VFile):
    tokenizeddocs = []
    combineddocuments = []
    fo = FileOperations()
    # tokenizer = RegexpTokenizer(r'\w+')
    if fo.exists(filename):
        # Load the previously tokenized documents from disk
        combineddocuments = fo.LoadFile(filename)

    else:
        tokenizer = MWETokenizer(glossarylist)
        regtokenizer = RegexpTokenizer(r'\w+')
        lmtzr = WordNetLemmatizer()
        stemmer = SnowballStemmer("english", ignore_stopwords=True)
        stop_words = stopwords.words('english')
        for doc in tqdm(docs):
            sentences = sent_tokenize(doc)

            tmp = []
            for sentence in sentences:
                # For each sentence in the sentences

                # Tokenize the sentence based on Regex and then using MWETokenizer
                tokens = tokenizer.tokenize(regtokenizer.tokenize(sentence.lower()))

                # Lower the case of all the tokens
                token_lowercase = [x.lower() for x in tokens]

                # Lemmatize the sentence: find the POS tags, then lemmatize each token
                tokens_lowercase_tagged = nltk.pos_tag(token_lowercase)
                lemmatized_sentence = [lmtzr.lemmatize(wrd, pos=get_wordnet_pos(tag)) for wrd, tag in tokens_lowercase_tagged]

                # Stem the sentence
                stemmed_sentence = [stemmer.stem(wrd) for wrd in lemmatized_sentence]

                # Remove the stop words
                processed_sentence = [word for word in stemmed_sentence if word not in stop_words]

                tmp.append(processed_sentence)
            tokenizeddocs.append(tmp)

        for doc in tqdm(tokenizeddocs):
            # Flatten each document's sentences into a single token list
            tokdoc = []
            for sent in doc:
                tokdoc.extend(sent)
            combineddocuments.append(tokdoc)

        # Save the file
        fo.SaveFile(filename, combineddocuments, mode='wb')

    del fo

    return combineddocuments
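
# get_wordnet_pos is referenced above but not defined in this snippet. The helper
# below is the conventional mapping from Penn Treebank tags to the WordNet POS
# constants expected by WordNetLemmatizer; the original helper may differ, but
# defaulting unknown tags to nouns is the usual choice.
from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    # WordNetLemmatizer's own default is also the noun POS
    return wordnet.NOUN
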
Example 10
    def __init__(self, distReadFile, distSaveDirectory, distSaveFilename,
                 interval, length):
        self.distanceReadFile = distReadFile
        self.distanceSaveFile = os.path.join(distSaveDirectory,
                                             distSaveFilename)
        self.sampleInterval = interval
        self.isRecording = False
        self.sampleLength = length
        self.recordingLoopActive = False
        self.threadsStarted = False
        self.nextTimer = None
        self.distLogger = FileOperations(distSaveDirectory, distSaveFilename)
    def CreateTaggedDocuments(self, tokenizeddocs, ids):
        taggeddocuments = None
        fo = FileOperations()

        if fo.exists(GV.taggedDocumentFile):
            taggeddocuments = fo.LoadFile(GV.taggedDocumentFile)
        else:
            taggeddocuments = [
                gensim.models.doc2vec.TaggedDocument(s, [ids[i]])
                for i, s in tqdm(enumerate(tokenizeddocs))
            ]
            fo.SaveFile(GV.taggedDocumentFile, taggeddocuments, mode='wb')

        del fo

        return taggeddocuments
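
# Each TaggedDocument pairs a token list with a single-element tag list (here the
# article's PMID), which is what lets the trained model be queried by PMID later.
# A quick illustration; the tokens and the PMID are made up:
def _tagged_document_example():
    import gensim

    tokens = ['blood_pressure', 'measured', 'in', 'adult', 'patients']
    doc = gensim.models.doc2vec.TaggedDocument(words=tokens, tags=['12345678'])
    print(doc.words[:3], doc.tags)
    # ['blood_pressure', 'measured', 'in'] ['12345678']
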
Example 12
def tag_text():
    file_name = "./data/Restaurants_Train.xml"

    os.environ[
        'CLASSPATH'] = '/home/sol315/Downloads/stanford-postagger-2015-12-09/stanford-postagger.jar'
    os.environ[
        'STANFORD_MODELS'] = './models/english-left3words-distsim.tagger'

    fo = FileOperations(file_name)
    fo.get_xml()
    sentences = fo.get_sentences()

    st = StanfordPOSTagger('english-bidirectional-distsim.tagger')
    cur = 0

    with open('taged-' + file_name[7:-4] + '.json', 'a') as f:
        for line in sentences:
            cur += 1
            print(cur, cur * 100 // len(sentences), '%')
            res = st.tag(line.split())
            json_tag = json.dumps(res)
            f.write(json_tag)
            f.write('\n')
Example 13
from FileOperations import FileOperations
import nltk
from nltk.tag.stanford import StanfordPOSTagger
from nltk.corpus import stopwords
import operator
import os
import re

# set the java environment variables:
# CLASSPATH is the path to the stanford-postagger.jar in your local disk
# STANFORD_MODELS is the path to the tagger file in your local disk
os.environ[
    'CLASSPATH'] = '/home/sol315/Downloads/stanford-postagger-2015-12-09/stanford-postagger.jar'
os.environ['STANFORD_MODELS'] = './models/english-left3words-distsim.tagger'

fo = FileOperations("taged.data")
tages = fo.get_taged_data()

origin = FileOperations("../input.json")
origin.get_json()

stop = set(stopwords.words('english'))

pairs = dict()
attributes = dict()
regex = re.compile('[^a-zA-Z]')

#this for loop is only used for get the attributes of task 2
for line in tages:
    for tag in line:
        if tag[1] == 'NN' or tag[1] == 'NNS':
Example 14
    nameOut = (name + "_Constraints")
    DOPobject.separateChildContent(constraintsDF,
                                   "Constraint Type",
                                   ret=0,
                                   name=nameOut)

    nameOut = (name + "_Products")
    DOPobject.processSingleDataset(productDF, nameOut)


from FileOperations import FileOperations
expPath = root + config["DEFAULT"]["ExportPath"]

#f = FileOperations("E:/CUA OpenBank API/OpenBanking/DataProcesing")
f = FileOperations(root)
i = 0

#for sheet in [3,4,5,6]:
sheets = [
    'TRANS_AND_SAVINGS_ACCOUNTS', 'CRED_AND_CHRG_CARDS', 'TERM_DEPOSITS',
    'TERM_DEPOSITS_RATES'
]

for sheet in sheets:
    if sheet == 'TERM_DEPOSITS_RATES':
        rates = SP(dataFile, sheet)
        rates.log = log
        rates.path = exportPath + "/"
        rates.createDict()
Example 15
            if course.theme not in theme_list:
                theme_list.append(course.theme)

        return sorted(theme_list)


if __name__ == "__main__":

    from FileOperations import FileOperations

    data_handler = None

    print("Sample data\n")
    input_filename = "data.a"

    file_handler = FileOperations(input_filename)

    if (file_handler.status):
        print(file_handler.data, "\n")
        data_handler = DataHandler(file_handler.data)

        data_handler.print_courses_list()

    from Command import VALID_COMMANDS_REQ

    print("\nCommand: locations")
    command = ["locations"]
    data_handler.process_command(command)

    print("\nCommand: courses <location> <theme>")
    print("\nEg: 1")
Example 16
#def getTimestamp():
#	return time.strftime('%Y-%m-%d_%H-%M-%S')


#utility method to set up GPIO used by sensors attached to the pi
#called at the beginning of __main__
def setupGPIO():
    os.system("sudo modprobe w1-therm")
    os.system("sudo modprobe w1-gpio")
    GPIO.setmode(GPIO.BCM)
    GPIO.setup(17, GPIO.IN, GPIO.PUD_UP)


#need to set up filepath and filename when we get config in __main__
#used for all classes and threads in this file for logging purposes
logger = FileOperations()

#Need to setup actual minFreeMB once we get config data in __main__
#used by recording threads to check if there is enough room on the pi to record data
storage = Storage()
storage.setLogger(logger)


#parses data in config file and returns a map of data entries to values
def readConfig():
    configDict = {}

    #finding and opening config file
    #parses config file with built in python config parser
    local_file_path = os.path.dirname(os.path.realpath(__file__)) + '/'
    config = ConfigParser.ConfigParser()
    def __init__(self):
        self.fileOperation = FileOperations()
Example 18
from FileOperations import FileOperations
import nltk
from nltk.tag.stanford import StanfordPOSTagger
from nltk.corpus import stopwords
import operator
import os
import re

# set the java environment variables:
# CLASSPATH is the path to the stanford-postagger.jar in your local disk
# STANFORD_MODELS is the path to the tagger file in your local disk
os.environ[
    'CLASSPATH'] = '/home/sol315/Downloads/stanford-postagger-2015-12-09/stanford-postagger.jar'
os.environ['STANFORD_MODELS'] = './models/english-left3words-distsim.tagger'

fo = FileOperations("taged.data")
tages = fo.get_taged_data()

stop = set(stopwords.words('english'))

attributes = dict()
regex = re.compile('[^a-zA-Z]')

for line in tages:
    for tag in line:
        if tag[1] == 'NN' or tag[1] == 'NNS':
            tag[0] = regex.sub('', tag[0]).lower()
            if tag[0] in stop or len(tag[0]) <= 1:
                tag[1] = 'STOP'
            elif tag[0] in attributes:
                attributes[tag[0]] += 1
Example 19
import json
from FileOperations import FileOperations
import nltk
from nltk.tag.stanford import StanfordPOSTagger
import os

# set the java environment variables:
# CLASSPATH is the path to the stanford-postagger.jar in your local disk
# STANFORD_MODELS is the path to the tagger file in your local disk
os.environ[
    'CLASSPATH'] = '/home/sol315/Downloads/stanford-postagger-2015-12-09/stanford-postagger.jar'
os.environ['STANFORD_MODELS'] = './models/english-left3words-distsim.tagger'

fo = FileOperations("../input.json")
fo.get_json()
st = StanfordPOSTagger('english-bidirectional-distsim.tagger')
cur = 0
with open('taged.data', 'a') as f:
    for line in fo.reviews:
        cur += 1
        print(cur, cur * 100 // fo.num_lines, '%')
        res = st.tag(line.split())
        json_tag = json.dumps(res)
        f.write(json_tag)
        f.write('\n')
Example 20
    def __init__(self):
        self.message = ""
        print("===> utils.py initiated")
        self.fileOperation = FileOperations()
Example 21
    def check_depend_then_ren_and_embed_original_metadata(
            self, append_faststart=True, artwork=False, copy_chapters=False):
        """This method will run the "check_dependencies_then_render" method and attempt to embed any artwork from the
		original file into the output (due to how ffmpeg works, the artwork can't always be copied in one command.)\n
		if artwork is True it will try to embed artwork from the input into the output specifically.
		This may happen if ffmpeg tries to output artwork to the first stream of an audio only file."""

        # Run standard command to render output.
        out_file_exists_result = self.check_depend_then_ren(
            append_faststart=append_faststart)

        if type(self.in_path) is list:
            in_meta_file = self.in_path[0]
        else:
            in_meta_file = self.in_path

        # If the output file exists then run the attempt_embed_metadata_silently method.
        if out_file_exists_result is True:
            # NOTE: This import is down here to avoid an infinite import.
            from FileOperations import FileOperations
            # This will attempt to embed any metadata (mainly for artwork) from the original file into the output.
            # (Due to how ffmpeg works, the artwork can't always be copied in one command.)
            # Create temporary output file with the original metadata embedded, delete the original output without the metadata,
            # and rename this temporary output to the desired output.
            for out_path in self.out_paths_list:
                temp_directory_to_embed_metadata = paths.Path().joinpath(
                    out_path.parent, '--temp_dir_to_embed_metadata_silently')
                paths.Path.mkdir(temp_directory_to_embed_metadata)
                temp_out_file = paths.Path().joinpath(
                    temp_directory_to_embed_metadata,
                    out_path.stem + out_path.suffix)
                FileOperations(out_path, temp_directory_to_embed_metadata,
                               False, self.print_ren_info, False,
                               False).copy_over_metadata(
                                   in_meta_file, copy_chapters)
                if temp_out_file.exists() is False:
                    if self.print_err is True:
                        print(
                            f'Error, input file to extract metadata silently from "{out_path}" not found.'
                        )
                    paths.Path(temp_directory_to_embed_metadata).rmdir()
                else:
                    out_path.unlink()
                    temp_out_file.rename(out_path)
                if artwork is True:
                    temp_art = FileOperations(
                        in_meta_file, temp_directory_to_embed_metadata, False,
                        self.print_ren_info, False, False).extract_artwork()
                    if temp_art is not False:
                        if temp_art.exists():
                            FileOperations(out_path,
                                           temp_directory_to_embed_metadata,
                                           False, self.print_ren_info, False,
                                           False).embed_artwork(temp_art)
                            temp_art.unlink()
                            out_path.unlink()
                            temp_out_file.rename(out_path)
                temp_directory_to_embed_metadata.rmdir()

            # Return only after every output path has been processed
            return True

        else:
            # A problem occurred while rendering and no output file was created so quit.
            return False