def Doc2Vec(docs, ids, glossarylist, pubmedarticlelists):
    # Tokenize all the docs
    tokenizeddocs = TokenizeDocs(docs, glossarylist, GV.tokenizedDocumentD2VFile)

    # Create the Doc2Vec model. Changing parameters will change the model name
    doc2vecmodel = Doc2VecModel(seed=1, num_features=200, min_word_count=2, context_size=3)
    taggeddocuments = doc2vecmodel.CreateTaggedDocuments(tokenizeddocs, ids)
    model = doc2vecmodel.Model(taggeddocuments, ids)

    # Get the model filename
    modelfile = doc2vecmodel.GetModelFileName()

    # Load the model
    model = doc2vecmodel.LoadModel(modelfile)

    # Save similar documents
    doc2vecmodel.SaveSimilarDocuments(pubmedarticlelists, GV.similarDocumentListFile)

    # Example: inspect the documents similar to one PMID
    similardocdict = FileOperations().LoadFile(GV.similarDocumentListFile)
    print(similardocdict['29794785']['Title'])
    print('---------------------------------------')
    for id, title, score in similardocdict['29794785']['Similar']:
        print(id, ' : ', title)

    doc2vecmodel.Visualize('29794785')
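# The Doc2VecModel wrapper used above (Model, GetModelFileName, LoadModel) is not shown in
# this snippet. As a rough, hedged sketch only: with gensim's Doc2Vec API, training and
# reloading a model from the tagged documents could look like the function below. The
# function name, file path, and the mapping of num_features/min_word_count/context_size
# onto vector_size/min_count/window are assumptions, not the project's actual code.
import gensim

def train_doc2vec_sketch(taggeddocuments, modelfile='./saveddata/doc2vec.model'):
    model = gensim.models.doc2vec.Doc2Vec(vector_size=200, min_count=2, window=3, seed=1)
    model.build_vocab(taggeddocuments)
    model.train(taggeddocuments, total_examples=model.corpus_count, epochs=model.epochs)
    model.save(modelfile)
    return gensim.models.doc2vec.Doc2Vec.load(modelfile)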
def TokenizeDocs(docs, glossarylist, filename=GV.tokenizedDocumentD2VFile):
    tokenizeddocs = []
    combineddocuments = []
    fo = FileOperations()
    # tokenizer = RegexpTokenizer(r'\w+')
    if fo.exists(filename):
        # Load the file
        combineddocuments = fo.LoadFile(filename)
    else:
        tokenizer = MWETokenizer(glossarylist)
        regtokenizer = RegexpTokenizer(r'\w+')
        for doc in tqdm(docs):
            sentences = sent_tokenize(doc)
            tmp = []
            for sentence in sentences:
                tokens = tokenizer.tokenize(regtokenizer.tokenize(sentence.lower()))
                token_lowercase = [x.lower() for x in tokens]
                tmp.append(token_lowercase)
            tokenizeddocs.append(tmp)

        for doc in tqdm(tokenizeddocs):
            tokdoc = []
            for sent in doc:
                tokdoc.extend(sent)
            combineddocuments.append(tokdoc)

        # Save the file
        fo.SaveFile(filename, combineddocuments, mode='wb')

    del fo
    return combineddocuments
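# A small illustration of why TokenizeDocs chains RegexpTokenizer with MWETokenizer:
# multi-word glossary terms survive tokenization as single tokens. The sample glossary
# entry and sentence below are made up for illustration; the real entries come from glossarylist.
from nltk.tokenize import MWETokenizer, RegexpTokenizer

mwe = MWETokenizer([('heart', 'attack')], separator='_')
words = RegexpTokenizer(r'\w+').tokenize('A heart attack needs urgent care.'.lower())
print(mwe.tokenize(words))  # ['a', 'heart_attack', 'needs', 'urgent', 'care']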
def InitializeGlossary():
    # Create FileOperations object
    fo = FileOperations()
    # Initialize the two lists to None
    glossarylist, synonymlist = None, None
    if fo.exists(GV.healthGlossaryFilePath):
        # Load the files from disk
        glossarylist, synonymlist = fo.LoadFile(GV.healthGlossaryFilePath), fo.LoadFile(GV.synonymsFilePath)
    else:
        # Get all the glossary terms
        glossarylist, synonymlist = GetGlossaryTerms()
        # Save the glossary terms
        fo.SaveFile(GV.healthGlossaryFilePath, glossarylist, mode='wb')
        # Save the synonyms
        fo.SaveFile(GV.synonymsFilePath, synonymlist, mode='wb')
    del fo
    return glossarylist, synonymlist
def __init__(self, parent, app, size, title, style):
    wx.Frame.__init__(self, parent, id=-1, size=size, title=title, style=style)
    self.app = app
    self.Centre()
    self.initUI()
    self.fileoperation = FileOperations()
    self.Show()
def PreprocessData():
    # Create an object initialized to None
    pubmedarticlelists = None
    # Create FileOperations object
    fo = FileOperations()
    # Create the parser for the xml file
    p = Preprocessing()
    # If the parsed file is present then load it, else parse the input file
    if fo.exists(GV.parsedDataFile):
        pubmedarticlelists = p.LoadFile(GV.parsedDataFile)
    else:
        # Call the Parse method
        pubmedarticlelists, unsavedpmids = p.parse(GV.inputXmlFile)
        print(len(pubmedarticlelists))
        print(len(unsavedpmids))
        # Save the parsed data to a file
        fo.SaveFile(GV.parsedDataFile, pubmedarticlelists, mode='wb')
        fo.SaveFile(GV.unsavedPmidFile, unsavedpmids, mode='w')
        pubmedarticlelists = p.LoadFile(GV.parsedDataFile)
    del fo
    return pubmedarticlelists
def __init__(self, tempReadFile, tempSaveDirectory, tempSaveFilename, interval):
    self.temperatureReadFile = tempReadFile
    self.temperatureSaveFile = os.path.join(tempSaveDirectory, tempSaveFilename)
    self.sampleInterval = interval
    self.isRecording = False
    self.recordingLoopActive = False
    self.threadsStarted = False
    self.nextTimer = None
    self.tempLogger = FileOperations(tempSaveDirectory, tempSaveFilename)
def SaveSimilarDocuments(self, pubmedarticlelists, similardocfilename):
    pdocs = self.doc2vec_model.docvecs.doctag_syn0  # [:pts]
    # Get all the pmids
    pmids = self.doc2vec_model.docvecs.offset2doctag  # [:pts]

    # Create the similar documents dictionary for each pmid
    similardocdict = {}
    import pickle
    for idx, pmid in tqdm(enumerate(pmids)):
        # Get the most similar documents for this pmid
        similardocdict[pmid] = self.doc2vec_model.docvecs.most_similar(pmid, topn=23752)
        similardocdict[pmid].insert(0, (pmid, '1.0'))
        # TODO New code
        if idx % 1000 == 0 or idx == 23753:
            with open('./saveddata/simdictpmid.pkl', mode='a+b') as f:  # appending, not overwriting
                pickle.dump(similardocdict, f)
            similardocdict = {}
        # TODO

    # Result structure:
    # { 'pmid1': {'Title': 'title', 'Similar': [[id, 'title', score], [id, 'title', score], ...]},
    #   'pmid2': {'Title': 'title', 'Similar': [[id, 'title', score], [id, 'title', score], ...]},
    #   ... }
    similararticlesdict = {}
    for idx, pmid in tqdm(enumerate(pmids)):
        # Find the current pmid's title
        doctitle = pubmedarticlelists[pmid].ArticleTitle
        # Find the pmids of the similar documents
        similardocpmids = similardocdict[pmid]
        similartitlescorelist = []
        # Iterate through all the pmids
        for id, score in similardocpmids:
            articletitle = pubmedarticlelists[id].ArticleTitle
            similartitlescorelist.append([id, articletitle, score])
        similararticlesdict[pmid] = {
            'Title': doctitle,
            'Similar': similartitlescorelist
        }

    # Save the similar documents
    fo = FileOperations()
    fo.SaveFile(similardocfilename, similararticlesdict)
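# SaveSimilarDocuments appends one pickle.dump per chunk of pmids to the same file, so
# reading it back requires repeated pickle.load calls until EOF. A minimal sketch, assuming
# the chunks are meant to be merged back into a single dictionary (the function name and
# merge step are assumptions, not code from the project):
import pickle

def load_chunked_pickle(path='./saveddata/simdictpmid.pkl'):
    merged = {}
    with open(path, 'rb') as f:
        while True:
            try:
                merged.update(pickle.load(f))  # each load returns one chunk dictionary
            except EOFError:
                break
    return merged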
def SaveGlossary(glossarylist, synonymlist):
    fo = FileOperations()
    if fo.exists(GV.glossaryFilePath):
        return
    else:
        glossarylist, synonymlist = fo.LoadFile(GV.healthGlossaryFilePath), fo.LoadFile(GV.synonymsFilePath)
        synonymterm2 = set(tuple(term2) for term1, term2 in synonymlist)
        synonymterm2 = list(list(term) for term in synonymterm2)
        glossarylist += list(synonymterm2)
        fo.SaveFile(GV.glossaryFilePath, glossarylist, mode='wb')
    del fo
def TokenizeDocsNew(docs, glossarylist, filename=GV.tokenizedDocumentD2VFile):
    tokenizeddocs = []
    combineddocuments = []
    fo = FileOperations()
    # tokenizer = RegexpTokenizer(r'\w+')
    if fo.exists(filename):
        # Load the file
        combineddocuments = fo.LoadFile(filename)
    else:
        tokenizer = MWETokenizer(glossarylist)
        regtokenizer = RegexpTokenizer(r'\w+')
        lmtzr = WordNetLemmatizer()
        stemmer = SnowballStemmer("english", ignore_stopwords=True)
        stop_words = stopwords.words('english')
        for doc in tqdm(docs):
            sentences = sent_tokenize(doc)
            tmp = []
            for sentence in sentences:
                # Tokenize the sentence with the regex tokenizer, then merge glossary terms with MWETokenizer
                tokens = tokenizer.tokenize(regtokenizer.tokenize(sentence.lower()))
                # Lowercase all the tokens
                token_lowercase = [x.lower() for x in tokens]
                # Lemmatize the sentence: find the POS tags, then lemmatize
                tokens_lowercase_tagged = nltk.pos_tag(token_lowercase)
                lemmatized_sentence = [lmtzr.lemmatize(wrd, pos=get_wordnet_pos(tag))
                                       for wrd, tag in tokens_lowercase_tagged]
                # Stem the sentence
                stemmed_sentence = [stemmer.stem(wrd) for wrd in lemmatized_sentence]
                # Remove the stop words
                processed_sentence = [word for word in stemmed_sentence if word not in stop_words]
                tmp.append(processed_sentence)
            tokenizeddocs.append(tmp)

        for doc in tqdm(tokenizeddocs):
            tokdoc = []
            for sent in doc:
                tokdoc.extend(sent)
            combineddocuments.append(tokdoc)

        # Save the file
        fo.SaveFile(filename, combineddocuments, mode='wb')

    del fo
    return combineddocuments
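# TokenizeDocsNew calls get_wordnet_pos(tag), which is not included in this snippet.
# The sketch below is the common mapping from Penn Treebank tags to WordNet POS constants;
# it is an assumption about what the project's helper does, defaulting to NOUN.
from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN  # default for nouns and anything unrecognized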
def __init__(self, distReadFile, distSaveDirectory, distSaveFilename, interval, length):
    self.distanceReadFile = distReadFile
    self.distanceSaveFile = os.path.join(distSaveDirectory, distSaveFilename)
    self.sampleInterval = interval
    self.isRecording = False
    self.sampleLength = length
    self.recordingLoopActive = False
    self.threadsStarted = False
    self.nextTimer = None
    self.distLogger = FileOperations(distSaveDirectory, distSaveFilename)
def CreateTaggedDocuments(self, tokenizeddocs, ids):
    taggeddocuments = None
    fo = FileOperations()
    if fo.exists(GV.taggedDocumentFile):
        taggeddocuments = fo.LoadFile(GV.taggedDocumentFile)
    else:
        taggeddocuments = [
            gensim.models.doc2vec.TaggedDocument(s, [ids[i]])
            for i, s in tqdm(enumerate(tokenizeddocs))
        ]
        fo.SaveFile(GV.taggedDocumentFile, taggeddocuments, mode='wb')
    del fo
    return taggeddocuments
def tag_text():
    file_name = "./data/Restaurants_Train.xml"
    os.environ['CLASSPATH'] = '/home/sol315/Downloads/stanford-postagger-2015-12-09/stanford-postagger.jar'
    os.environ['STANFORD_MODELS'] = './models/english-left3words-distsim.tagger'
    fo = FileOperations(file_name)
    fo.get_xml()
    sentences = fo.get_sentences()
    st = StanfordPOSTagger('english-bidirectional-distsim.tagger')
    f = open('taged-' + file_name[7:-4] + '.json', 'a')
    cur = 0
    for line in sentences:
        cur += 1
        print(cur, cur * 100 // len(sentences), '%')
        res = st.tag(line.split())
        json_tag = json.dumps(res)
        f.write(json_tag)
        f.write('\n')
    f.close()
from FileOperations import FileOperations
import nltk
from nltk.tag.stanford import StanfordPOSTagger
from nltk.corpus import stopwords
import operator
import os
import re

# Set the Java environment variables:
# CLASSPATH is the path to stanford-postagger.jar on your local disk
# STANFORD_MODELS is the path to the tagger file on your local disk
os.environ['CLASSPATH'] = '/home/sol315/Downloads/stanford-postagger-2015-12-09/stanford-postagger.jar'
os.environ['STANFORD_MODELS'] = './models/english-left3words-distsim.tagger'

fo = FileOperations("taged.data")
tages = fo.get_taged_data()
origin = FileOperations("../input.json")
origin.get_json()

stop = set(stopwords.words('english'))
pairs = dict()
attributes = dict()
regex = re.compile('[^a-zA-Z]')

# This loop is only used to collect the attributes for task 2
for line in tages:
    for tag in line:
        if tag[1] == 'NN' or tag[1] == 'NNS':
nameOut = (name + "_Constraints")
DOPobject.separateChildContent(constraintsDF, "Constraint Type", ret=0, name=nameOut)
nameOut = (name + "_Products")
DOPobject.processSingleDataset(productDF, nameOut)

from FileOperations import FileOperations

expPath = root + config["DEFAULT"]["ExportPath"]
# f = FileOperations("E:/CUA OpenBank API/OpenBanking/DataProcesing")
f = FileOperations(root)

i = 0
# for sheet in [3,4,5,6]:
sheets = [
    'TRANS_AND_SAVINGS_ACCOUNTS',
    'CRED_AND_CHRG_CARDS',
    'TERM_DEPOSITS',
    'TERM_DEPOSITS_RATES'
]
for sheet in sheets:
    if sheet == 'TERM_DEPOSITS_RATES':
        rates = SP(dataFile, sheet)
        rates.log = log
        rates.path = exportPath + "/"
        rates.createDict()
        if course.theme not in theme_list:
            theme_list.append(course.theme)
    return sorted(theme_list)


if __name__ == "__main__":
    from FileOperations import FileOperations

    data_handler = None
    print("Sample data\n")
    input_filename = "data.a"
    file_handler = FileOperations(input_filename)
    if (file_handler.status):
        print(file_handler.data, "\n")
        data_handler = DataHandler(file_handler.data)
        data_handler.print_courses_list()

        from Command import VALID_COMMANDS_REQ

        print("\nCommand: locations")
        command = ["locations"]
        data_handler.process_command(command)

        print("\nCommand: courses <location> <theme>")
        print("\nEg: 1")
# def getTimestamp():
#     return time.strftime('%Y-%m-%d_%H-%M-%S')

# Utility method to set up GPIO used by sensors attached to the pi.
# Called at the beginning of __main__
def setupGPIO():
    os.system("sudo modprobe w1-therm")
    os.system("sudo modprobe w1-gpio")
    GPIO.setmode(GPIO.BCM)
    GPIO.setup(17, GPIO.IN, GPIO.PUD_UP)

# Need to set up filepath and filename when we get config in __main__.
# Used by all classes and threads in this file for logging purposes
logger = FileOperations()

# Need to set up the actual minFreeMB once we get config data in __main__.
# Used by recording threads to check if there is enough room on the pi to record data
storage = Storage()
storage.setLogger(logger)

# Parses data in the config file and returns a map of data entries to values
def readConfig():
    configDict = {}
    # Find and open the config file,
    # then parse it with the built-in Python config parser
    local_file_path = os.path.dirname(os.path.realpath(__file__)) + '/'
    config = ConfigParser.ConfigParser()
def __init__(self):
    self.fileOperation = FileOperations()
from FileOperations import FileOperations
import nltk
from nltk.tag.stanford import StanfordPOSTagger
from nltk.corpus import stopwords
import operator
import os
import re

# Set the Java environment variables:
# CLASSPATH is the path to stanford-postagger.jar on your local disk
# STANFORD_MODELS is the path to the tagger file on your local disk
os.environ['CLASSPATH'] = '/home/sol315/Downloads/stanford-postagger-2015-12-09/stanford-postagger.jar'
os.environ['STANFORD_MODELS'] = './models/english-left3words-distsim.tagger'

fo = FileOperations("taged.data")
tages = fo.get_taged_data()

stop = set(stopwords.words('english'))
attributes = dict()
regex = re.compile('[^a-zA-Z]')

for line in tages:
    for tag in line:
        if tag[1] == 'NN' or tag[1] == 'NNS':
            tag[0] = regex.sub('', tag[0]).lower()
            if tag[0] in stop or len(tag[0]) <= 1:
                tag[1] = 'STOP'
            elif tag[0] in attributes:
                attributes[tag[0]] += 1
import json
from FileOperations import FileOperations
import nltk
from nltk.tag.stanford import StanfordPOSTagger
import os

# Set the Java environment variables:
# CLASSPATH is the path to stanford-postagger.jar on your local disk
# STANFORD_MODELS is the path to the tagger file on your local disk
os.environ['CLASSPATH'] = '/home/sol315/Downloads/stanford-postagger-2015-12-09/stanford-postagger.jar'
os.environ['STANFORD_MODELS'] = './models/english-left3words-distsim.tagger'

fo = FileOperations("../input.json")
fo.get_json()
st = StanfordPOSTagger('english-bidirectional-distsim.tagger')

f = open('taged.data', 'a')
cur = 0
for line in fo.reviews:
    cur += 1
    print(cur, cur * 100 // fo.num_lines, '%')
    res = st.tag(line.split())
    json_tag = json.dumps(res)
    f.write(json_tag)
    f.write('\n')
f.close()
def __init__(self):
    self.message = ""
    print("===> utils.py initiated")
    self.fileOperation = FileOperations()
def check_depend_then_ren_and_embed_original_metadata(self, append_faststart=True,
                                                      artwork=False, copy_chapters=False):
    """Run the "check_depend_then_ren" method and attempt to embed any artwork from the
    original file into the output (due to how ffmpeg works, the artwork can't always be
    copied in one command).

    If artwork is True, it will try to embed artwork from the input into the output
    specifically. This may be needed if ffmpeg tries to output artwork to the first
    stream of an audio-only file."""
    # Run standard command to render output.
    out_file_exists_result = self.check_depend_then_ren(append_faststart=append_faststart)
    if type(self.in_path) is list:
        in_meta_file = self.in_path[0]
    else:
        in_meta_file = self.in_path
    # If the output file exists then attempt to embed the metadata silently.
    if out_file_exists_result is True:
        # NOTE: This import is down here to avoid a circular import.
        from FileOperations import FileOperations
        # This will attempt to embed any metadata (mainly artwork) from the original file into the output.
        # (Due to how ffmpeg works, the artwork can't always be copied in one command.)
        # Create a temporary output file with the original metadata embedded, delete the original
        # output without the metadata, and rename the temporary output to the desired output.
        for out_path in self.out_paths_list:
            temp_directory_to_embed_metadata = paths.Path().joinpath(
                out_path.parent, '--temp_dir_to_embed_metadata_silently')
            paths.Path.mkdir(temp_directory_to_embed_metadata)
            temp_out_file = paths.Path().joinpath(
                temp_directory_to_embed_metadata, out_path.stem + out_path.suffix)
            FileOperations(out_path, temp_directory_to_embed_metadata, False,
                           self.print_ren_info, False, False).copy_over_metadata(
                               in_meta_file, copy_chapters)
            if temp_out_file.exists() is False:
                if self.print_err is True:
                    print(f'Error, input file to extract metadata silently from "{out_path}" not found.')
                paths.Path(temp_directory_to_embed_metadata).rmdir()
            else:
                out_path.unlink()
                temp_out_file.rename(out_path)
            if artwork is True:
                temp_art = FileOperations(in_meta_file, temp_directory_to_embed_metadata, False,
                                          self.print_ren_info, False, False).extract_artwork()
                if temp_art is not False:
                    if temp_art.exists():
                        FileOperations(out_path, temp_directory_to_embed_metadata, False,
                                       self.print_ren_info, False, False).embed_artwork(temp_art)
                        temp_art.unlink()
                        out_path.unlink()
                        temp_out_file.rename(out_path)
            temp_directory_to_embed_metadata.rmdir()
        return True
    else:
        # A problem occurred while rendering and no output file was created, so quit.
        return False