def crawl(path_rules, search_rules=None, result_rules=None, output=None):
    """Crawl directories starting at ``path_rules["start"]`` until the
    ``path_rules["max_depth"]`` depth is reached. Directories are validated
    against ``path_rules`` and files against ``path_rules["file"]``; when a
    file is valid, ``search_rules`` is executed on it.

    :param path_rules (dict): regex rules on file paths to include or exclude files/directories in the crawl
    :param search_rules (Optional[dict]): regex rules to search for patterns in files
    :param result_rules (Optional[dict]): regex rules to extract data from matched files
    :return (dict of str: dict): a dictionary mapping each file path to the results of search_rules on that file
    """
    result_by_file = {}
    if path_rules is None or "start" not in path_rules:
        return result_by_file
    root_depth = path_rules["start"].rstrip(os.path.sep).count(os.path.sep) - 1
    for dir_path, subdirList, fileList in os.walk(path_rules["start"]):
        current_depth = dir_path.count(os.path.sep) - root_depth
        if "max_depth" not in path_rules or path_rules["max_depth"] >= current_depth:
            for fname in fileList:
                full_path = os.path.join(dir_path, fname)
                if os.path.isfile(full_path) \
                        and ("file" not in path_rules
                             or FO.validate_string(full_path,
                                                   path_rules["file"].get("include"),
                                                   path_rules["file"].get("exclude"))):
                    result_by_file[full_path] = FO.validate_file(full_path, search_rules, result_rules)
        # Prune invalid subdirectories in place; removing items while iterating
        # over subdirList would skip the entry following each removal.
        subdirList[:] = [subdir for subdir in subdirList
                         if FO.validate_string(os.path.join(dir_path, subdir),
                                               path_rules.get("include"),
                                               path_rules.get("exclude")) is not False]
    if output is not None:
        Crawler.save_crawler_data(result_by_file, output)
    return result_by_file
def crawl_multithread(path_rules, search_rules=None, result_rules=None, output=None, threads=None):
    """Multithreaded version of :func:`Crawler.crawl`, tentatively using one thread per file."""
    result_by_file = {}
    if path_rules is None or "start" not in path_rules:
        return result_by_file
    futures = {}
    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
        root_depth = path_rules["start"].rstrip(os.path.sep).count(os.path.sep) - 1
        for dir_path, subdirList, fileList in os.walk(path_rules["start"]):
            current_depth = dir_path.count(os.path.sep) - root_depth
            if "max_depth" not in path_rules or path_rules["max_depth"] >= current_depth:
                for fname in fileList:
                    full_path = os.path.join(dir_path, fname)
                    if os.path.isfile(full_path) \
                            and ("file" not in path_rules
                                 or FO.validate_string(full_path,
                                                       path_rules["file"].get("include"),
                                                       path_rules["file"].get("exclude"))):
                        futures[executor.submit(FO.validate_file, full_path,
                                                search_rules, result_rules)] = full_path
            # Prune in place instead of removing while iterating, which would skip entries.
            subdirList[:] = [subdir for subdir in subdirList
                             if FO.validate_string(os.path.join(dir_path, subdir),
                                                   path_rules.get("include"),
                                                   path_rules.get("exclude")) is not False]
        for future in concurrent.futures.as_completed(futures):
            file_result = futures[future]
            try:
                result_by_file[file_result] = future.result()
            except Exception as exc:
                logging.debug('%r generated an exception: %s', file_result, exc)
    if output is not None:
        Crawler.save_crawler_data(result_by_file, output)
    return result_by_file
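# A minimal usage sketch for the two crawl functions above. The rule shapes are
# inferred from the code ("include"/"exclude" hold regex lists, "file" holds
# per-file rules); the directory, patterns, and the "HasAnalytics" rule name
# are hypothetical, not taken from the project:
if __name__ == "__main__":
    path_rules = {
        "start": "./projects",                       # root of the walk
        "max_depth": 3,                              # stop descending below this depth
        "exclude": [r"\.git"],                       # skip version-control directories
        "file": {"include": [r"\.html?$", r"\.js$"]},
    }
    search_rules = {"HasAnalytics": [r"GoogleAnalytics"]}
    results = crawl_multithread(path_rules, search_rules, threads=8)
    for path, match in results.items():
        print(path, match)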
def PreprocessData():
    # Create an object initialized to None
    pubmedarticlelists = None
    # Create a FileOperations object
    fo = FileOperations()
    # Parser for the xml file
    p = Preprocessing()
    # If a parsed file is present then load it, else parse the input file
    if fo.exists(GV.parsedDataFile):
        pubmedarticlelists = p.LoadFile(GV.parsedDataFile)
    else:
        # Call the Parse method
        pubmedarticlelists, unsavedpmids = p.parse(GV.inputXmlFile)
        print(len(pubmedarticlelists))
        print(len(unsavedpmids))
        # Save the parsed data to a file
        fo.SaveFile(GV.parsedDataFile, pubmedarticlelists, mode='wb')
        fo.SaveFile(GV.unsavedPmidFile, unsavedpmids, mode='w')
        pubmedarticlelists = p.LoadFile(GV.parsedDataFile)
    del fo
    return pubmedarticlelists
def test_search_string(self):
    with open("./test/test_inputs/test_search_string.txt", "r") as f:
        content = f.read()
    self.assertTrue(FileOperations.validate_string(content, ["tes"]))
    self.assertTrue(FileOperations.validate_string(content, ["Google", "Analytics"]))
    # deliberate misspelling to exercise the negative case
    self.assertFalse(FileOperations.validate_string(content, ["Google", "Analytecs"]))
def TokenizeDocs(docs, glossarylist, filename=GV.tokenizedDocumentD2VFile):
    tokenizeddocs = []
    combineddocuments = []
    fo = FileOperations()
    if fo.exists(filename):
        # Load the previously tokenized documents
        combineddocuments = fo.LoadFile(filename)
    else:
        tokenizer = MWETokenizer(glossarylist)
        regtokenizer = RegexpTokenizer(r'\w+')
        for doc in tqdm(docs):
            sentences = sent_tokenize(doc)
            tmp = []
            for sentence in sentences:
                tokens = tokenizer.tokenize(regtokenizer.tokenize(sentence.lower()))
                token_lowercase = [x.lower() for x in tokens]
                tmp.append(token_lowercase)
            tokenizeddocs.append(tmp)
        # Flatten each document's sentences into a single token list
        for doc in tqdm(tokenizeddocs):
            tokdoc = []
            [tokdoc.extend(sent) for sent in doc]
            combineddocuments.append(tokdoc)
        # Save the file
        fo.SaveFile(filename, combineddocuments, mode='wb')
    del fo
    return combineddocuments
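# A quick illustration of the two-stage tokenization above (real NLTK APIs;
# the glossary entry is made up): RegexpTokenizer splits on word characters,
# then MWETokenizer re-joins known multi-word glossary terms with underscores.
from nltk.tokenize import MWETokenizer, RegexpTokenizer

mwe = MWETokenizer([("blood", "pressure")])  # assumed glossary entry
words = RegexpTokenizer(r'\w+').tokenize("High blood pressure risk.".lower())
print(mwe.tokenize(words))  # ['high', 'blood_pressure', 'risk']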
def test_crawl_native_fakeCrawler(self):
    parameter_file = "./test/search_parameters.json"
    c = Crawler("SimpleTest", parameters=FileOperations.get_from_JSON_file(parameter_file))
    self.assertEqual(c.name, "SimpleTest")
    c.crawl_native()
    self.assertTrue(os.path.isfile(parameter_file))
    result_from_file = FileOperations.get_from_JSON_file(c.output["path"])
    self.assertEqual(len(result_from_file), 3)
def SaveSimilarDocuments(self, pubmedarticlelists, similardocfilename):
    pdocs = self.doc2vec_model.docvecs.doctag_syn0  # [:pts]
    # Get all the pmids
    pmids = self.doc2vec_model.docvecs.offset2doctag  # [:pts]
    # Create the similar-documents dictionary for each pmid
    similardocdict = {}
    import pickle
    for idx, pmid in tqdm(enumerate(pmids)):
        # output the top similar documents
        similardocdict[pmid] = self.doc2vec_model.docvecs.most_similar(pmid, topn=23752)
        similardocdict[pmid].insert(0, (pmid, '1.0'))
        # TODO: new code — flush the partial dictionary to disk every 1000 pmids
        if idx % 1000 == 0 or idx == 23753:
            with open('./saveddata/simdictpmid.pkl', mode='a+b') as f:  # appending, not overwriting
                pickle.dump(similardocdict, f)
            similardocdict = {}
    # TODO: target layout:
    # { 'pmid1': {'Title': 'Title', 'Similar': [[id, 'title', score], [id, 'title', score], ...]},
    #   'pmid2': {'Title': 'Title', 'Similar': [[id, 'title', score], [id, 'title', score], ...]},
    #   ... }
    similararticlesdict = {}
    for idx, pmid in tqdm(enumerate(pmids)):
        # Find the current pmid's title
        doctitle = pubmedarticlelists[pmid].ArticleTitle
        # Find the pmids of similar documents
        similardocpmids = similardocdict[pmid]
        similartitlescorelist = []
        # Iterate through all the pmids
        for id, score in similardocpmids:
            articletitle = pubmedarticlelists[id].ArticleTitle
            similartitlescorelist.append([id, articletitle, score])
        similararticlesdict[pmid] = {
            'Title': doctitle,
            'Similar': similartitlescorelist
        }
    # Save the similar documents
    fo = FileOperations()
    fo.SaveFile(similardocfilename, similararticlesdict)
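# Because the flush loop above appends one pickle.dump() per chunk to the same
# file, reading it back requires repeated pickle.load() calls until EOF — a
# sketch of the matching reader (file path as used above):
import pickle

def load_all_chunks(path='./saveddata/simdictpmid.pkl'):
    merged = {}
    with open(path, 'rb') as f:
        while True:
            try:
                merged.update(pickle.load(f))  # each load returns one flushed dict
            except EOFError:
                break
    return merged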
def TokenizeDocsNew(docs, glossarylist, filename=GV.tokenizedDocumentD2VFile):
    tokenizeddocs = []
    combineddocuments = []
    fo = FileOperations()
    if fo.exists(filename):
        # Load the previously tokenized documents
        combineddocuments = fo.LoadFile(filename)
    else:
        tokenizer = MWETokenizer(glossarylist)
        regtokenizer = RegexpTokenizer(r'\w+')
        lmtzr = WordNetLemmatizer()
        stemmer = SnowballStemmer("english", ignore_stopwords=True)
        stop_words = stopwords.words('english')
        for doc in tqdm(docs):
            sentences = sent_tokenize(doc)
            tmp = []
            for sentence in sentences:
                # Tokenize each sentence with the regex tokenizer, then re-join
                # multi-word glossary terms with the MWETokenizer
                tokens = tokenizer.tokenize(regtokenizer.tokenize(sentence.lower()))
                # Lower the case of all the tokens
                token_lowercase = [x.lower() for x in tokens]
                # Lemmatize the sentence: find the POS tags, then lemmatize
                tokens_lowercase_tagged = nltk.pos_tag(token_lowercase)
                lemmatized_sentence = [lmtzr.lemmatize(wrd, pos=get_wordnet_pos(tag))
                                       for wrd, tag in tokens_lowercase_tagged]
                # Stem the sentence
                stemmed_sentence = [stemmer.stem(wrd) for wrd in lemmatized_sentence]
                # Remove the stop words
                processed_sentence = [word for word in stemmed_sentence if word not in stop_words]
                tmp.append(processed_sentence)
            tokenizeddocs.append(tmp)
        # Flatten each document's sentences into a single token list
        for doc in tqdm(tokenizeddocs):
            tokdoc = []
            [tokdoc.extend(sent) for sent in doc]
            combineddocuments.append(tokdoc)
        # Save the file
        fo.SaveFile(filename, combineddocuments, mode='wb')
    del fo
    return combineddocuments
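# get_wordnet_pos() is referenced above but not defined in this snippet; a
# common implementation maps Penn Treebank tags to WordNet POS constants — a
# sketch under that assumption:
from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN  # WordNetLemmatizer's own default POS is also noun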
def __init__(self, turns, sheep_count, sheep_speed, wolf_speed, limit):
    self.turns = turns
    self.sheep_count = sheep_count
    self.sheep_speed = sheep_speed
    self.wolf_speed = wolf_speed
    self.limit = limit
    self.sheep_list = []
    self.wolf = Wolf(wolf_speed, self.sheep_list)
    self.turn = 1
    self.list_with_dictionaries = []
    FileOperations.create_csv()
def __init__(self, parent, app, size, title, style):
    wx.Frame.__init__(self, parent, id=-1, size=size, title=title, style=style)
    self.app = app
    self.Centre()
    self.initUI()
    self.fileoperation = FileOperations()
    self.Show()
def CreateTaggedDocuments(self, tokenizeddocs, ids):
    taggeddocuments = None
    fo = FileOperations()
    if fo.exists(GV.taggedDocumentFile):
        taggeddocuments = fo.LoadFile(GV.taggedDocumentFile)
    else:
        taggeddocuments = [
            gensim.models.doc2vec.TaggedDocument(s, [ids[i]])
            for i, s in tqdm(enumerate(tokenizeddocs))
        ]
        fo.SaveFile(GV.taggedDocumentFile, taggeddocuments, mode='wb')
    del fo
    return taggeddocuments
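# What one of those entries looks like (real gensim API; the token list here
# is made up for illustration — each document carries its PMID as its one tag):
import gensim

doc = gensim.models.doc2vec.TaggedDocument(
    words=['hypertension', 'blood_pressure', 'therapy'],  # tokenized abstract
    tags=['29794785'])                                    # the document's PMID
print(doc.words, doc.tags)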
def Doc2Vec(docs, ids, glossarylist, pubmedarticlelists):
    # Tokenize all the docs
    tokenizeddocs = TokenizeDocs(docs, glossarylist, GV.tokenizedDocumentD2VFile)
    # Create the Doc2Vec model. Changing parameters will change the model name
    doc2vecmodel = Doc2VecModel(seed=1, num_features=200, min_word_count=2, context_size=3)
    taggeddocuments = doc2vecmodel.CreateTaggedDocuments(tokenizeddocs, ids)
    model = doc2vecmodel.Model(taggeddocuments, ids)
    # Get the model filename
    modelfile = doc2vecmodel.GetModelFileName()
    # Load the model
    model = doc2vecmodel.LoadModel(modelfile)
    # Save similar documents
    doc2vecmodel.SaveSimilarDocuments(pubmedarticlelists, GV.similarDocumentListFile)
    # Play
    similardocdict = FileOperations().LoadFile(GV.similarDocumentListFile)
    print(similardocdict['29794785']['Title'])
    print('---------------------------------------')
    for id, title, score in similardocdict['29794785']['Similar']:
        print(id, ' : ', title)
    doc2vecmodel.Visualize('29794785')
def test_crawl_multithread_mmcoreAsync(self):
    parameter_data = FileOperations.get_from_JSON_file("./test/search_async.json")
    crawlers = parameter_data["crawlers"]
    crawlerName = "dotAsync"
    c = Crawler(crawlerName, parameters=crawlers[crawlerName])
    data = c.crawl_native(threads=None)
    self.assertTrue(len(data) > 0)
    c.save_crawler_data(data, crawlers[crawlerName]["output"])
def menu(self):
    try:
        while True:
            opt = input("command: ").strip()
            if opt == "cl":
                os.system("clear")
                continue
            self.client.send(opt.encode("utf-8"))
            if opt == "e" or opt == "exit":
                break
            if opt == "dl" or opt == "ul":
                filename = input("File to {}, leave blank to skip: ".format(opt)).strip()
                if opt == "dl":
                    if not filename:
                        self.client.send(" ".encode("utf-8"))
                        continue
                    self.client.send(filename.encode("utf-8"))
                    file_exists = self.client.recv(self.segement_size).decode("utf-8")
                    if file_exists == "1":
                        FileOps.recieve_file("{}/{}".format(self.SAVETO, filename),
                                             self.client, self.segement_size)
                        print("[+] File downloaded successfully")
                    else:
                        print(file_exists)
                    continue
                elif opt == "ul":
                    if not filename:
                        self.client.send(" ".encode("utf-8"))
                        continue
                    if FileOps.file_exists(filename):
                        self.client.send(filename.encode("utf-8"))
                        FileOps.send_file(filename, self.client, self.segement_size)
                        print("[+] File uploaded successfully")
                    else:
                        self.client.send(" ".encode("utf-8"))
                        print("[-] Unable to upload as the file could not be found: {}".format(filename))
            else:
                reply = self.client.recv(self.segement_size).decode("utf-8")
                print(reply)
    except OSError as err:
        print("[-] The client or server has caused the following error to occur:\n {}".format(err))
    except KeyboardInterrupt:
        pass
    finally:
        self.client.close()
        print("[*] Client closed.")
        sys.exit(0)
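# The wire protocol implied above: the client first sends the command string,
# then (for "dl"/"ul") the filename; the server answers "1" before streaming a
# requested file. FileOps.recieve_file is not shown in this snippet — a minimal
# sketch of what such a helper could look like, assuming the sender closes the
# connection or signals end-of-stream with an empty read:
def recieve_file(dest_path, sock, segment_size):
    with open(dest_path, "wb") as out:
        while True:
            chunk = sock.recv(segment_size)
            if not chunk:  # peer finished sending
                break
            out.write(chunk)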
def test_crawl_clientIntegrations(self):
    parameter_data = FileOperations.get_from_JSON_file("./test/search_integration.json")
    crawlers = parameter_data["crawlers"]
    crawlerName = "Integration"
    c = Crawler(crawlerName, parameters=crawlers[crawlerName])
    data = c.crawl_native()
    self.assertTrue(len(data) > 0)
    c.save_crawler_data(data, crawlers[crawlerName]["output"])
def InitializeGlossary():
    # Create a FileOperations object
    fo = FileOperations()
    # Initialize the two lists to None
    glossarylist, synonymlist = [None] * 2
    if fo.exists(GV.healthGlossaryFilePath):
        # Load the files from disk
        glossarylist, synonymlist = fo.LoadFile(GV.healthGlossaryFilePath), fo.LoadFile(GV.synonymsFilePath)
    else:
        # Get all the glossary terms
        glossarylist, synonymlist = GetGlossaryTerms()
        # Save the glossary terms
        fo.SaveFile(GV.healthGlossaryFilePath, glossarylist, mode='wb')
        # Save the synonyms
        fo.SaveFile(GV.synonymsFilePath, synonymlist, mode='wb')
    del fo
    return glossarylist, synonymlist
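# Several helpers in this codebase share the same load-or-compute-and-cache
# shape; factored out, the pattern is just this (a sketch — exists/LoadFile/
# SaveFile are the project's own pickle-backed FileOperations helpers):
def cached(fo, path, compute):
    if fo.exists(path):
        return fo.LoadFile(path)
    value = compute()
    fo.SaveFile(path, value, mode='wb')
    return value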
def tag_text():
    file_name = "./data/Restaurants_Train.xml"
    os.environ['CLASSPATH'] = '/home/sol315/Downloads/stanford-postagger-2015-12-09/stanford-postagger.jar'
    os.environ['STANFORD_MODELS'] = './models/english-left3words-distsim.tagger'
    fo = FileOperations(file_name)
    fo.get_xml()
    sentences = fo.get_sentences()
    st = StanfordPOSTagger('english-bidirectional-distsim.tagger')
    f = open('taged-' + file_name[7:-4] + '.json', 'a')
    cur = 0
    for line in sentences:
        cur += 1
        print cur, cur * 100 / len(sentences), '%'
        res = st.tag(line.split())
        json_tag = json.dumps(res)
        f.write(json_tag)
        f.write('\n')
def SaveGlossary(glossarylist, synonymlist):
    fo = FileOperations()
    if fo.exists(GV.glossaryFilePath):
        return
    else:
        glossarylist, synonymlist = fo.LoadFile(GV.healthGlossaryFilePath), fo.LoadFile(GV.synonymsFilePath)
        # Deduplicate the second synonym terms, then merge them into the glossary
        synonymterm2 = set(tuple(term2) for term1, term2 in synonymlist)
        synonymterm2 = list(list(term) for term in synonymterm2)
        glossarylist += list(synonymterm2)
        fo.SaveFile(GV.glossaryFilePath, glossarylist, mode='wb')
    del fo
def simulate(self):
    for i in range(0, self.sheep_count):
        self.sheep_list.append(Sheep(i, self.sheep_speed, self.limit))
    while self.turn <= self.turns and self.sheep_left_check():
        for sheep in self.sheep_list:
            sheep.update()
        self.wolf.update()
        self.display()
        FileOperations.append_to_csv([self.turn, self.count_alive_sheep()])
        FileOperations.append_dictionary_to_list(
            self.list_with_dictionaries, self.create_dictionary())
        self.turn += 1
    FileOperations.create_json(self.list_with_dictionaries)
#def getTimestamp():
#    return time.strftime('%Y-%m-%d_%H-%M-%S')

#utility method to set up GPIO used by sensors attached to the pi
#called at the beginning of __main__
def setupGPIO():
    os.system("sudo modprobe w1-therm")
    os.system("sudo modprobe w1-gpio")
    GPIO.setmode(GPIO.BCM)
    GPIO.setup(17, GPIO.IN, GPIO.PUD_UP)

#need to set up filepath and filename when we get config in __main__
#used for all classes and threads in this file for logging purposes
logger = FileOperations()

#need to set up the actual minFreeMB once we get config data in __main__
#used by recording threads to check if there is enough room on the pi to record data
storage = Storage()
storage.setLogger(logger)

#parses data in config file and returns a map of data entries to values
def readConfig():
    configDict = {}
    #finding and opening the config file
    #parses config file with the built-in python config parser
    local_file_path = os.path.dirname(os.path.realpath(__file__)) + '/'
    config = ConfigParser.ConfigParser()
    config.readfp(open(local_file_path + 'config'))
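# readConfig() is cut off above; under the assumption that it walks every
# section, a completion could look like this (standard ConfigParser API —
# the helper name is hypothetical):
def read_config_sketch(config):
    configDict = {}
    for section in config.sections():
        for key, value in config.items(section):
            configDict[key] = value
    return configDict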
class utils:
    messageList = []
    chatMsgList = []
    DPPojoList = []
    WPPojoList = []
    errorMessage = ""

    def __init__(self):
        self.message = ""
        print "===> utils.py initiated"
        self.fileOperation = FileOperations()

    def write(self, fileLocation, text):
        self.fileOperation.write(fileLocation, text)

    def loadChatMsgFromFile(self, chatFileLocation):
        # loading chat
        lines = self.fileOperation.read(chatFileLocation)
        for line in lines:
            self.chatMsgList.append(line)

    # loads message.txt
    def loadMessages(self, msgFileName):
        lines = self.fileOperation.read(msgFileName)
        isAMessage = False
        message = ""
        lineNumber = 0
        for eachLine in lines:
            lineNumber += 1
            if eachLine.strip() == "" or eachLine.strip()[0] == '#':
                continue
            elif eachLine.strip().startswith(INSTRUCTION_START):
                if isAMessage:
                    d = PBI.PyBusyInfo("Your message in file ======> " + msgFileName +
                                       " is missing message end '</message>'\n")
                    wx.Yield()
                    time.sleep(3)
                    del d
                    quit()
                isAMessage = True
                msgPojo = messagePojo()
                id_ = self.getIdFromString(eachLine)
                if id_.isdigit() and len(id_) == 6:
                    msgPojo.setID(id_)
                else:
                    d = PBI.PyBusyInfo("Your message in file ======> " + msgFileName +
                                       " <======= lineNumber= " + str(lineNumber) +
                                       " given id " + id_ + " is not a valid id\n")
                    wx.Yield()
                    time.sleep(3)
                    del d
                    quit()
                if self.MsgStartIndex > 0 and INSTRUCTION_END not in eachLine:
                    message += eachLine[self.MsgStartIndex:]
            elif isAMessage and INSTRUCTION_START not in eachLine and INSTRUCTION_END not in eachLine:
                message += eachLine + "\n"
            if isAMessage and INSTRUCTION_END in eachLine:
                if self.MsgStartIndex > 0:
                    if eachLine.strip().startswith(INSTRUCTION_START):
                        message += eachLine[self.MsgStartIndex:eachLine.index(INSTRUCTION_END)] + "\n"
                    elif len(eachLine[:eachLine.index(INSTRUCTION_END)].strip()) > 0:
                        message += eachLine[:eachLine.index(INSTRUCTION_END)] + "\n"
                isAMessage = False
                msgPojo.setMsg(message)
                self.messageList.append(msgPojo)
                message = ""
        return self.messageList

    def getIdFromString(self, line):
        self.MsgStartIndex = -1
        isSpaceInBetwnNum = False
        if "id" in line and "=" in line:
            index = line.index("=")
            id_ = ""
            cnt = 0
            for chr_ in line[index:]:
                cnt += 1
                if chr_ == " ":
                    if len(id_) > 0:
                        isSpaceInBetwnNum = True
                    continue
                if chr_.isdigit():
                    if not isSpaceInBetwnNum:
                        id_ += chr_
                    else:
                        return "has invalid ID"
                elif chr_ == ">":
                    self.MsgStartIndex = index + cnt
                    break
            if len(id_.strip()) == 6 and id_.strip().isdigit():
                return id_.strip()
            else:
                return " has No ID"

    def readWPScript(self, WPFilename):
        lines = self.fileOperation.read(WPFilename)
        str_ = ""
        list_ = []
        errorMessage = ""
        threatEndMet = True
        neutralEndMet = True
        for eachLine in lines:
            if eachLine.strip() == "" or eachLine.strip()[0] == '#':
                continue
            text = eachLine.strip()
            if text.startswith(WORD_PROBE_START) and len(text) == len(WORD_PROBE_START):
                wppojo = WPPojo()
            elif text.startswith(BLOCK_START) and BLOCK_END in text:
                blocknumber = eachLine[(eachLine.index(BLOCK_START) +
                                        len(BLOCK_START)):eachLine.index(BLOCK_END)].strip()
                if blocknumber.isdigit():
                    wppojo.setBlockNumber(blocknumber)
                else:
                    errorMessage += "the block number for word probe is not a number!!"
            elif text.startswith(INSTRUCTION_BEFORE_BLOCK_START) and INSTRUCTION_BEFORE_BLOCK_END in text:
                listTxt = eachLine[(eachLine.index(INSTRUCTION_BEFORE_BLOCK_START) +
                                    len(INSTRUCTION_BEFORE_BLOCK_START)):eachLine.index(INSTRUCTION_BEFORE_BLOCK_END)].strip()
                if listTxt is not None and len(listTxt) > 0:
                    list_ = listTxt.split(",")
                    if list_ is not None and len(list_) > 0:
                        wppojo.setBeforeBlockInstructionList(list_)
            elif text.startswith(WORD_PROBE_THREAT_WORD_START):
                if WORD_PROBE_THREAT_WORD_END in text:
                    threatEndMet = True
                    str_ = eachLine[(eachLine.index(WORD_PROBE_THREAT_WORD_START) +
                                     len(WORD_PROBE_THREAT_WORD_START)):eachLine.index(WORD_PROBE_THREAT_WORD_END)].strip()
                    list_ = str_.split(",")
                    wppojo.setThreatWordList(list_)
                    list_ = []
                    str_ = ""
                else:
                    str_ = eachLine[(eachLine.index(WORD_PROBE_THREAT_WORD_START) +
                                     len(WORD_PROBE_THREAT_WORD_START)):].strip()
                    threatEndMet = False
            elif not threatEndMet:
                if WORD_PROBE_THREAT_WORD_END in text:
                    threatEndMet = True
                    str_ = str_ + eachLine[:eachLine.index(WORD_PROBE_THREAT_WORD_END)].strip()
                    list_ = str_.split(",")
                    wppojo.setThreatWordList(list_)
                    list_ = []
                    str_ = ""
                else:
                    str_ = str_ + eachLine.strip()
                    threatEndMet = False
            elif threatEndMet and text.startswith(WORD_PROBE_NEUTRAL_WORD_START):
                if WORD_PROBE_NEUTRAL_WORD_END in text:
                    neutralEndMet = True
                    str_ = eachLine[(eachLine.index(WORD_PROBE_NEUTRAL_WORD_START) +
                                     len(WORD_PROBE_NEUTRAL_WORD_START)):eachLine.index(WORD_PROBE_NEUTRAL_WORD_END)].strip()
                    list_ = str_.split(",")
                    wppojo.setNeutralWordList(list_)
                    list_ = []
                    str_ = ""
                else:
                    str_ = eachLine[(eachLine.index(WORD_PROBE_NEUTRAL_WORD_START) +
                                     len(WORD_PROBE_NEUTRAL_WORD_START)):].strip()
                    neutralEndMet = False
            elif not neutralEndMet:
                if WORD_PROBE_NEUTRAL_WORD_END in text:
                    neutralEndMet = True
                    str_ = str_ + eachLine[:eachLine.index(WORD_PROBE_NEUTRAL_WORD_END)].strip()
                    list_ = str_.split(",")
                    wppojo.setNeutralWordList(list_)
                    list_ = []
                    str_ = ""
                else:
                    str_ = str_ + eachLine.strip()
                    neutralEndMet = False
            elif neutralEndMet and text.startswith(INSTRUCTION_AFTER_BLOCK_START) and INSTRUCTION_AFTER_BLOCK_END in text:
                listTxt = eachLine[(eachLine.index(INSTRUCTION_AFTER_BLOCK_START) +
                                    len(INSTRUCTION_AFTER_BLOCK_START)):eachLine.index(INSTRUCTION_AFTER_BLOCK_END)].strip()
                if listTxt is not None and len(listTxt) > 0:
                    list_ = listTxt.split(",")
                    if list_ is not None and len(list_) > 0:
                        wppojo.setAfterBlockInstructionList(list_)
            elif text.startswith(WORD_PROBE_END):
                if wppojo.isValid():
                    self.WPPojoList.append(wppojo)
                else:
                    errorMessage = self.message + " \nnumber of words or its types or number of probes-position are unequal for block " + blocknumber

    def readAllScripts(self, DPfilename, WPFilename):
        # readAllScripts from utils
        self.readDPScript(DPfilename)
        self.readWPScript(WPFilename)
        if len(self.errorMessage) > 0:
            dial = wx.MessageDialog(None, self.errorMessage, "abcd",
                                    wx.OK | wx.ICON_INFORMATION)
            dial.ShowModal()
            quit()

    def readDPScript(self, DPfilename):
        lines = self.fileOperation.read(DPfilename)
        str_ = ""
        list_ = []
        base_dir = os.path.dirname(__file__)
        path = os.path.dirname(base_dir) + "/dotProbe/images/"
        for eachLine in lines:
            if eachLine.strip() == "" or eachLine.strip()[0] == '#':
                continue
            text = eachLine.strip()
            if text.startswith(DOT_PROBE_START) and len(text) == len(DOT_PROBE_START):
                # start of each dot probe task block
                dpPojo = DPPojo()
            elif text.startswith(BLOCK_START) and BLOCK_END in text:
                blocknumber = eachLine[(eachLine.index(BLOCK_START) +
                                        len(BLOCK_START)):eachLine.index(BLOCK_END)].strip()
                if blocknumber.isdigit():
                    dpPojo.setBlockNumber(int(blocknumber))
                else:
                    self.errorMessage += "the block number for dot probe is not a number!!"
            elif text.startswith(INSTRUCTION_BEFORE_BLOCK_START) and INSTRUCTION_BEFORE_BLOCK_END in text:
                listTxt = eachLine[(eachLine.index(INSTRUCTION_BEFORE_BLOCK_START) +
                                    len(INSTRUCTION_BEFORE_BLOCK_START)):eachLine.index(INSTRUCTION_BEFORE_BLOCK_END)].strip()
                if listTxt is not None and len(listTxt) > 0:
                    list_ = listTxt.split(",")
                    if list_ is not None and len(list_) > 0:
                        dpPojo.setBeforeBlockMsgList(list_)
            elif text.startswith(UP_START) and UP_END in text:
                str_ = eachLine[(eachLine.index(UP_START) +
                                 len(UP_START)):eachLine.index(UP_END)].strip()
                list_ = str_.split(",")
                self.message = ""
                if not self.validateImageFile(path, list_):
                    if len(self.message) > 0:
                        self.errorMessage += self.message
                else:
                    dpPojo.setUpImageList(list_)
            elif text.startswith(UP_TYPE_START) and UP_TYPE_END in text:
                str_ = eachLine[(eachLine.index(UP_TYPE_START) +
                                 len(UP_TYPE_START)):eachLine.index(UP_TYPE_END)].strip()
                list_ = str_.split(",")
                dpPojo.setUpImageType(list_)
            elif text.startswith(DOWN_START) and DOWN_END in text:
                str_ = eachLine[(eachLine.index(DOWN_START) +
                                 len(DOWN_START)):eachLine.index(DOWN_END)].strip()
                list_ = str_.split(",")
                self.message = ""
                if not self.validateImageFile(path, list_):
                    if len(self.message) > 0:
                        self.errorMessage += self.message
                else:
                    dpPojo.setDownImageList(list_)
            elif text.startswith(DOWN_TYPE_START) and DOWN_TYPE_END in text:
                str_ = eachLine[(eachLine.index(DOWN_TYPE_START) +
                                 len(DOWN_TYPE_START)):eachLine.index(DOWN_TYPE_END)].strip()
                list_ = str_.split(",")
                dpPojo.setDownImageType(list_)
            elif text.startswith(PROBE_START) and PROBE_END in text:
                str_ = eachLine[(eachLine.index(PROBE_START) +
                                 len(PROBE_START)):eachLine.index(PROBE_END)].strip()
                list_ = str_.split(",")
                if not self.validateType(list_):
                    if len(self.message) > 0:
                        self.errorMessage = self.message + self.errorMessage
                else:
                    dpPojo.setProbePosnList(list_)
            elif text.startswith(INSTRUCTION_AFTER_BLOCK_START) and INSTRUCTION_AFTER_BLOCK_END in text:
                listTxt = eachLine[(eachLine.index(INSTRUCTION_AFTER_BLOCK_START) +
                                    len(INSTRUCTION_AFTER_BLOCK_START)):eachLine.index(INSTRUCTION_AFTER_BLOCK_END)].strip()
                if listTxt is not None and len(listTxt) > 0:
                    list_ = listTxt.split(",")
                    if list_ is not None and len(list_) > 0:
                        dpPojo.setAfterBlockMsgList(list_)
            elif text.startswith(DOT_PROBE_END):
                if dpPojo.isValid():
                    self.DPPojoList.append(dpPojo)
                else:
                    self.errorMessage = self.message + " \nnumber of images or its types or probes are unequal for block " + blocknumber

    def printDPPojoList(self):
        for dpPojo_ in self.DPPojoList:
            print dpPojo_.toString()

    def printWPPojoList(self):
        for wpPojo_ in self.WPPojoList:
            print wpPojo_.toString()

    def validateImageFile(self, path, imageFileNameList):
        if len(imageFileNameList) <= 0:
            self.message = "no image file listed in the script\n"
            self.dialog.ShowMessage('Status Check', self.message)
            return False
        result = True
        for filename in imageFileNameList:
            if not os.path.isfile(path + filename):
                self.message += path + filename + " is not a valid file.\n"
                result = False
        return result

    def validateType(self, typeList):
        result = True
        if len(typeList) <= 0:
            self.message = "no probe type listed in the script\n"
        for eachType in typeList:
            if not (eachType.upper() == PROBE_DOWN or eachType.upper() == PROBE_UP):
                self.message = "PROBE cannot be other than up or down.\n"
                result = False
        return result

    def getMessagePojoByID(self, num_id):
        for msg in self.messageList:
            if msg.getID() == num_id:
                return msg
        return None
import json
import time

import scipy
from sklearn.naive_bayes import MultinomialNB

from FileOperations import FileOperations

fo = FileOperations("../../input.json")
fo.get_json()
#fo.normalize()
#tokens = fo.tokenize()

# get the tf-idf data and labels
split = fo.num_lines / 100 * 80
data = fo.get_tfidf()
label = fo.get_value()

# split the data into 80% train and 20% test
train_data = data[:split]
train_label = label[:split]
test_data = data[split:]
test_label = label[split:]

start = time.time()
clf_bayes = fo.train_bayes_model(train_data, train_label)
end = time.time()
print "Train Time: " + str(end - start) + 's'

start = time.time()
TP, FP, FN, TN = fo.score(clf_bayes, test_data, test_label)
end = time.time()
print "Test Time: " + str(end - start) + 's'
print "Accuracy: ", float(TP + TN) / float(TP + FP + FN + TN)
def test_validate_file(self):
    validate_result = FileOperations.validate_file(
        "./test/test_inputs/test_search_string.txt",
        ["Google Analytics"],
        {"CONTENT": {"registered": "'([\w\s]+)'"}, "built-in": ["DATE UPDATED"]})
    self.assertEqual(validate_result["registered"][1], "Site Catalyst")
    self.assertGreater(time.localtime(), time.strptime(validate_result["DATE UPDATED"]))
def test_crawl_fake_directCrawl(self):
    parameters = FileOperations.get_from_JSON_file("./test/search_parameters.json")
    Crawler.crawl(parameters["crawling"], parameters["rules"],
                  parameters["result"], parameters["output"])
    self.assertTrue(os.path.isfile(parameters["output"]["path"]))
    result_from_file = FileOperations.get_from_JSON_file(parameters["output"]["path"])
    self.assertEqual(len(result_from_file), 3)
def test_crawl_native_minimalParameterFile_multithreaded_native(self):
    parameters = FileOperations.get_from_JSON_file("./test/minimal_parameters.json")
    data = Crawler.crawl_multithread(parameters["crawling"], parameters["rules"],
                                     parameters.get("result"))
    self.assertEqual(
        data['./test/test_inputs/minimalist_data.txt']['matches']['HasName']['city'][0],
        'London')
def check_depend_then_ren_and_embed_original_metadata(
        self, append_faststart=True, artwork=False, copy_chapters=False):
    """Run the "check_depend_then_ren" method, then attempt to embed any
    metadata from the original file into the output (due to how ffmpeg
    works, the artwork can't always be copied in one command).

    If artwork is True, specifically try to embed artwork from the input
    into the output. This may happen if ffmpeg tries to output artwork to
    the first stream of an audio-only file."""
    # Run the standard command to render the output.
    out_file_exists_result = self.check_depend_then_ren(append_faststart=append_faststart)
    if type(self.in_path) is list:
        in_meta_file = self.in_path[0]
    else:
        in_meta_file = self.in_path
    # If the output file exists then attempt to embed the metadata silently.
    if out_file_exists_result is True:
        # NOTE: This import is down here to avoid a circular import.
        from FileOperations import FileOperations
        # Attempt to embed any metadata (mainly artwork) from the original file
        # into the output: create a temporary output file with the original
        # metadata embedded, delete the original output without the metadata,
        # and rename the temporary output to the desired output.
        for out_path in self.out_paths_list:
            temp_directory_to_embed_metadata = paths.Path().joinpath(
                out_path.parent, '--temp_dir_to_embed_metadata_silently')
            paths.Path.mkdir(temp_directory_to_embed_metadata)
            temp_out_file = paths.Path().joinpath(
                temp_directory_to_embed_metadata, out_path.stem + out_path.suffix)
            FileOperations(out_path, temp_directory_to_embed_metadata, False,
                           self.print_ren_info, False, False).copy_over_metadata(
                               in_meta_file, copy_chapters)
            if temp_out_file.exists() is False:
                if self.print_err is True:
                    print(f'Error, input file to extract metadata silently from "{out_path}" not found.')
                paths.Path(temp_directory_to_embed_metadata).rmdir()
            else:
                out_path.unlink()
                temp_out_file.rename(out_path)
            if artwork is True:
                temp_art = FileOperations(
                    in_meta_file, temp_directory_to_embed_metadata, False,
                    self.print_ren_info, False, False).extract_artwork()
                if temp_art is not False:
                    if temp_art.exists():
                        FileOperations(out_path, temp_directory_to_embed_metadata,
                                       False, self.print_ren_info, False,
                                       False).embed_artwork(temp_art)
                        temp_art.unlink()
                        out_path.unlink()
                        temp_out_file.rename(out_path)
            temp_directory_to_embed_metadata.rmdir()
        return True
    else:
        # A problem occurred while rendering and no output file was created, so quit.
        return False
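# For reference, the kind of ffmpeg invocations that helpers such as
# copy_over_metadata / embed_artwork typically wrap (a sketch — the project's
# actual commands are not shown in this snippet; -map_metadata, -c copy and
# the attached_pic disposition are standard ffmpeg features):
#
#   # copy the rendered streams, pulling global metadata from the original:
#   ffmpeg -i rendered.mp4 -i original.mp4 -map 0 -map_metadata 1 -c copy with_meta.mp4
#
#   # embed a cover image into an audio file as attached artwork:
#   ffmpeg -i audio.m4a -i cover.jpg -map 0:a -map 1 -c copy \
#          -disposition:v:0 attached_pic with_art.m4a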
class IndexGenerator:
    def __init__(self):
        self.fileOperation = FileOperations()

    # Column layout of the input file:
    # 0    1    2       3         4            5      6       7         8            9       10
    # S.N. date subject trialcode currentblock number up_type down_type trialtimeout correct latency
    def calculateindex(self, filename):
        self.DAFI = 0.0
        self.PAFI = 0.0
        N_UP = 0.0
        N_DOWN = 0.0
        D_UP = 0.0
        D_DOWN = 0.0
        P_UP = 0.0
        P_DOWN = 0.0

        # for N_up and N_down
        self.neutral_neutralCount = 0
        self.neutral_neutral_case = False
        self.upProbeInNeutral_neutral = 0
        self.downProbeInNeutral_neutral = 0
        self.responseTimeOnUpProbe_inNeutral_neutral = 0.0
        self.responseTimeOnDownProbe_inNeutral_neutral = 0.0

        # for D_up
        self.negative_up_anyCount = 0
        self.probeUp_inNegative_up = 0
        self.negative_up = False
        self.responseTimeOnProbe_Up_Negative_Up = 0.0

        # for D_down
        self.negative_down_anyCount = 0
        self.probeDown_inNegative_down = 0
        self.negative_down = False
        self.responseTimeOnProbe_Down_Negative_Down = 0.0

        # for P_up
        self.positive_Up_anyCount = 0
        self.ProbeUp_inPositive_up = 0
        self.positive_up = False
        self.responseTimeOnProbe_Up_Positive_Up = 0.0

        # for P_down
        self.positive_Down_anyCount = 0
        self.probeDown_inPositive_down = 0
        self.positive_down = False
        self.responseTimeOnProbe_Down_Positive_Down = 0.0

        lines = self.fileOperation.read(filename)
        data = []
        lineNumberToNeglectForTraining = 0
        lineNumber = 0
        for eachLine in lines:
            lineNumber += 1
            if eachLine.strip() == "" or eachLine.strip()[0] == '#':
                continue
            data = eachLine.split("\t")
            if len(data) > 12:
                print "length > 12"
            else:
                if data[3].strip().upper() == REPORT.TRIALCODE[FIXATION_INTRIALCODE]:
                    continue
                if data[3].strip().upper() == REPORT.TRIALCODE[PRACTISEIMAGE_INTRIALCODE]:
                    # neglect the training data: if the trialcode is PRACTISE_PIC,
                    # neglect up to the next line, which contains the probe tracks
                    lineNumberToNeglectForTraining = lineNumber + 1
                    continue
                if lineNumber <= lineNumberToNeglectForTraining:
                    continue
                if self.positive_up:
                    # P_UP
                    self.positive_Up_anyCount += 1
                    self.positive_up = False
                    if int(data[9].strip()) == 1 and data[3].strip().upper() == REPORT.TRIALCODE[UP_PROBE_INTRIALCODE]:
                        self.ProbeUp_inPositive_up += 1
                        self.responseTimeOnProbe_Up_Positive_Up += float(data[10].strip()) * 1000
                if self.positive_down:
                    # P_DOWN
                    self.positive_Down_anyCount += 1
                    self.positive_down = False
                    if int(data[9].strip()) == 1 and data[3].strip().upper() == REPORT.TRIALCODE[dOWN_PROBE_INTRIALCODE]:
                        self.probeDown_inPositive_down += 1
                        self.responseTimeOnProbe_Down_Positive_Down += float(data[10].strip()) * 1000
                if self.negative_up:
                    # D_UP
                    self.negative_up_anyCount += 1
                    self.negative_up = False
                    if int(data[9].strip()) == 1 and data[3].strip().upper() == REPORT.TRIALCODE[UP_PROBE_INTRIALCODE]:
                        self.probeUp_inNegative_up += 1
                        self.responseTimeOnProbe_Up_Negative_Up += float(data[10].strip()) * 1000
                if self.negative_down:
                    # D_DOWN
                    self.negative_down_anyCount += 1
                    self.negative_down = False
                    if int(data[9].strip()) == 1 and data[3].strip().upper() == REPORT.TRIALCODE[dOWN_PROBE_INTRIALCODE]:
                        self.probeDown_inNegative_down += 1
                        self.responseTimeOnProbe_Down_Negative_Down += float(data[10].strip()) * 1000
                if self.neutral_neutral_case:
                    # N_UP and N_DOWN
                    self.neutral_neutral_case = False
                    if int(data[9].strip()) == 1 and data[3].strip().upper() == REPORT.TRIALCODE[UP_PROBE_INTRIALCODE]:
                        self.neutral_neutralCount += 1
                        self.upProbeInNeutral_neutral += 1
                        self.responseTimeOnUpProbe_inNeutral_neutral += float(data[10].strip()) * 1000
                    elif int(data[9].strip()) == 1 and data[3].strip().upper() == REPORT.TRIALCODE[dOWN_PROBE_INTRIALCODE]:
                        self.neutral_neutralCount += 1
                        self.downProbeInNeutral_neutral += 1
                        self.responseTimeOnDownProbe_inNeutral_neutral += float(data[10].strip()) * 1000
                    elif not (data[3].strip().upper() == REPORT.TRIALCODE[dOWN_PROBE_INTRIALCODE]
                              or data[3].strip().upper() == REPORT.TRIALCODE[UP_PROBE_INTRIALCODE]):
                        print "this is due to incorrect data"
                if data[6].strip().upper() == "NEUTRAL" and data[7].strip().upper() == "NEUTRAL":
                    # N_UP and N_DOWN
                    self.neutral_neutral_case = True
                if data[6].strip().upper() == "NEGATIVE":
                    # D_UP
                    self.negative_up = True
                if data[7].strip().upper() == "NEGATIVE":
                    # D_DOWN
                    self.negative_down = True
                if data[6].strip().upper() == "POSITIVE":
                    # P_UP
                    self.positive_up = True
                if data[7].strip().upper() == "POSITIVE":
                    # P_DOWN
                    self.positive_down = True

        if not self.neutral_neutralCount == (self.upProbeInNeutral_neutral + self.downProbeInNeutral_neutral):
            print "please verify the number of " + REPORT.TRIALCODE[UP_PROBE_INTRIALCODE] + \
                  " and " + REPORT.TRIALCODE[dOWN_PROBE_INTRIALCODE] + \
                  " . Their sum does not match the total number of neutral-neutral trials"
        print "all the calculations are done by neglecting the incorrect responses i.e. if correct=0 then that data is neglected"
        print "#*****************************************************#"
        print "total no. of CASE: UP_NEUTRAL_DOWN_NEUTRAL = " + str(self.neutral_neutralCount)

        print "\n#**********************N_UP***************************#\n"
        print "total no. of CASE: UP_PROBE in UP_NEUTRAL_DOWN_NEUTRAL = " + str(self.upProbeInNeutral_neutral)
        print "sum of response time for CASE: UP_PROBE in UP_NEUTRAL_DOWN_NEUTRAL = " + str(self.responseTimeOnUpProbe_inNeutral_neutral)
        if not self.upProbeInNeutral_neutral == 0:
            N_UP = self.responseTimeOnUpProbe_inNeutral_neutral / self.upProbeInNeutral_neutral
        print "average response time for CASE: UP_PROBE in UP_NEUTRAL_DOWN_NEUTRAL (N_UP) = " + str(N_UP)

        print "\n#**********************N_DOWN***************************#\n"
        print "total no. of CASE: DOWN_PROBE in UP_NEUTRAL_DOWN_NEUTRAL = " + str(self.downProbeInNeutral_neutral)
        print "sum of response time for CASE: DOWN_PROBE in UP_NEUTRAL_DOWN_NEUTRAL = " + str(self.responseTimeOnDownProbe_inNeutral_neutral)
        if not self.downProbeInNeutral_neutral == 0:
            N_DOWN = self.responseTimeOnDownProbe_inNeutral_neutral / self.downProbeInNeutral_neutral
        print "average response time for CASE: DOWN_PROBE in UP_NEUTRAL_DOWN_NEUTRAL (N_DOWN) = " + str(N_DOWN)

        print "\n#**********************D_UP***************************#\n"
        # print "total no. of CASE: UP_NEGATIVE_DOWN_ANY = " + str(self.negative_up_anyCount)
        print "total no. of CASE: UP_PROBE in UP_NEGATIVE_DOWN_ANY = " + str(self.probeUp_inNegative_up)
        print "sum of response time for CASE: UP_PROBE in UP_NEGATIVE_DOWN_ANY = " + str(self.responseTimeOnProbe_Up_Negative_Up)
        if not self.probeUp_inNegative_up == 0:
            D_UP = self.responseTimeOnProbe_Up_Negative_Up / self.probeUp_inNegative_up
        print "average response time for CASE: UP_PROBE in UP_NEGATIVE_DOWN_ANY (D_UP) = " + str(D_UP)

        print "\n#**********************D_DOWN***************************#\n"
        # print "total no. of CASE: UP_ANY_DOWN_NEGATIVE = " + str(self.negative_down_anyCount)
        print "total no. of CASE: DOWN_PROBE in UP_ANY_DOWN_NEGATIVE = " + str(self.probeDown_inNegative_down)
        print "sum of response time for CASE: DOWN_PROBE in UP_ANY_DOWN_NEGATIVE = " + str(self.responseTimeOnProbe_Down_Negative_Down)
        if not self.probeDown_inNegative_down == 0:
            D_DOWN = self.responseTimeOnProbe_Down_Negative_Down / self.probeDown_inNegative_down
        print "average response time for CASE: DOWN_PROBE in UP_ANY_DOWN_NEGATIVE (D_DOWN) = " + str(D_DOWN)

        print "\n#**********Distress Attentional Facilitation Index (Y)**********#\n"
        self.DAFI = 0.5 * ((N_UP - D_UP) + (N_DOWN - D_DOWN))
        print "Distress Attentional Facilitation Index (Y) = " + str(self.DAFI)

        print "\n#**********************P_UP***************************#\n"
        # print "total no. of CASE: UP_POSITIVE_DOWN_ANY = " + str(self.positive_Up_anyCount)
        print "total no. of CASE: UP_PROBE in UP_POSITIVE_DOWN_ANY = " + str(self.ProbeUp_inPositive_up)
        print "sum of response time for CASE: UP_PROBE in UP_POSITIVE_DOWN_ANY = " + str(self.responseTimeOnProbe_Up_Positive_Up)
        if not self.ProbeUp_inPositive_up == 0:
            P_UP = self.responseTimeOnProbe_Up_Positive_Up / self.ProbeUp_inPositive_up
        print "average response time for CASE: UP_PROBE in UP_POSITIVE_DOWN_ANY (P_UP) = " + str(P_UP)

        print "\n#**********************P_DOWN***************************#\n"
        # print "total no. of CASE: UP_ANY_DOWN_POSITIVE = " + str(self.positive_Down_anyCount)
        print "total no. of CASE: DOWN_PROBE in UP_ANY_DOWN_POSITIVE = " + str(self.probeDown_inPositive_down)
        print "sum of response time for CASE: DOWN_PROBE in UP_ANY_DOWN_POSITIVE = " + str(self.responseTimeOnProbe_Down_Positive_Down)
        if not self.probeDown_inPositive_down == 0:
            P_DOWN = self.responseTimeOnProbe_Down_Positive_Down / self.probeDown_inPositive_down
        print "average response time for CASE: DOWN_PROBE in UP_ANY_DOWN_POSITIVE (P_DOWN) = " + str(P_DOWN)

        print "\n#**********Positive Attentional Facilitation Index (P)**********#\n"
        self.PAFI = 0.5 * ((N_UP - P_UP) + (N_DOWN - P_DOWN))
        print "Positive Attentional Facilitation Index (P) = " + str(self.PAFI)
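# In summary, the two indices computed above are mean response-time contrasts
# against the neutral-neutral baseline (all values in milliseconds):
#
#   DAFI = 0.5 * ((N_UP - D_UP) + (N_DOWN - D_DOWN))   # distress (negative) facilitation
#   PAFI = 0.5 * ((N_UP - P_UP) + (N_DOWN - P_DOWN))   # positive facilitation
#
# A positive index means probes replacing emotional images were answered faster
# than probes in neutral-neutral trials, i.e. attention was drawn to that image.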
def test_crawl_native_minimalParameterFile_multithreaded(self):
    c = Crawler("MyMinimalCrawler",
                FileOperations.get_from_JSON_file("./test/minimal_parameters.json"))
    self.assertEqual(
        c.crawl_native(threads=10)['./test/test_inputs/minimalist_data.txt']['matches']['HasName']['city'][0],
        'London')
class DistanceReader:
    def __init__(self, distReadFile, distSaveDirectory, distSaveFilename, interval, length):
        self.distanceReadFile = distReadFile
        self.distanceSaveFile = os.path.join(distSaveDirectory, distSaveFilename)
        self.sampleInterval = interval
        self.isRecording = False
        self.sampleLength = length
        self.recordingLoopActive = False
        self.threadsStarted = False
        self.nextTimer = None
        self.distLogger = FileOperations(distSaveDirectory, distSaveFilename)

    #starts recording distance at the specified interval
    def startRecording(self):
        self.recordingLoopActive = True
        if self.threadsStarted == False:
            threading.Timer(0, self.sampleDistanceWithInterval, ()).start()
            self.threadsStarted = True

    #requests the recording thread to stop
    def stopRecording(self):
        self.recordingLoopActive = False

    #used to force the thread to stop recording if there was an error in recording data
    def resetIsRecording(self):
        self.isRecording = False

    #this method is called by timer threads to record data at the specified interval
    def sampleDistanceWithInterval(self):
        #launching the next timer thread to record distance after the specified interval
        self.nextTimer = threading.Timer(self.sampleInterval,
                                         self.sampleDistanceWithInterval, ())
        self.nextTimer.start()
        if self.recordingLoopActive == True and self.storage.hasSpace():
            self.isRecording = True
            #the line below ensures that, even when there is an error recording distance,
            #isRecording won't stay on; the pi has sampleLength + 15 seconds to record
            threading.Timer(self.sampleLength + 15, self.resetIsRecording, ()).start()
            timestamp = getTimestamp()
            try:
                self.logger.log("[DistanceReader] started recording distance")
                end_time = time.time() + self.sampleLength
                while time.time() < end_time:
                    distance = self.readDistance()
                    timestamp = TimeUtils.getTimestamp()
                    output = "%s %f\n" % (timestamp, distance)
                    self.distLogger.appendToFile(output)
                self.logger.log("[DistanceReader] recorded distance")
            except Exception as e:
                self.logger.logError("DistanceReader", "Error reading distance", e)
            self.isRecording = False

    #sends a trigger pulse, times the echo, and returns the measured distance
    def measure(self):
        GPIO.output(24, True)
        time.sleep(0.00001)
        GPIO.output(24, False)
        start = time.time()
        while GPIO.input(23) == 0:
            start = time.time()
        while GPIO.input(23) == 1:
            stop = time.time()
        elapsed = stop - start
        distance = (elapsed * 34300) / 2
        return distance

    #returns the average of two measurements for accuracy
    def measure_average(self):
        distance1 = self.measure()
        time.sleep(0.1)
        distance2 = self.measure()
        distance = (distance1 + distance2) / 2
        return distance

    #reads the distance
    def readDistance(self):
        with open(self.distanceReadFile, 'r') as distanceFile:
            distance = self.measure_average()
            time.sleep(0.1)
            return distance

    #cancels any timers that are waiting to execute; used when quitting the program
    def quit(self):
        if self.nextTimer != None:
            self.nextTimer.cancel()

    def setLogger(self, logger):
        self.logger = logger

    def setStorage(self, storage):
        self.storage = storage
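# The arithmetic in measure(): sound travels at roughly 343 m/s (34300 cm/s),
# and the echo time of an HC-SR04-style ultrasonic sensor (the sensor model is
# an assumption) covers the round trip, so
#
#   distance_cm = (elapsed_s * 34300) / 2
#
# e.g. an echo of 2.9 ms gives (0.0029 * 34300) / 2 ≈ 49.7 cm.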
from FileOperations import FileOperations
import nltk
from nltk.tag.stanford import StanfordPOSTagger
from nltk.corpus import stopwords
import operator
import os
import re

# set the java environment variables:
# CLASSPATH is the path to stanford-postagger.jar on your local disk
# STANFORD_MODELS is the path to the tagger file on your local disk
os.environ['CLASSPATH'] = '/home/sol315/Downloads/stanford-postagger-2015-12-09/stanford-postagger.jar'
os.environ['STANFORD_MODELS'] = './models/english-left3words-distsim.tagger'

fo = FileOperations("taged.data")
tages = fo.get_taged_data()
origin = FileOperations("../input.json")
origin.get_json()
stop = set(stopwords.words('english'))
pairs = dict()
attributes = dict()
regex = re.compile('[^a-zA-Z]')

# this for loop is only used to get the attributes of task 2
for line in tages:
    for tag in line:
        if tag[1] == 'NN' or tag[1] == 'NNS':
            tag[0] = regex.sub('', tag[0]).lower()
            if tag[0] in stop or len(tag[0]) <= 1:
                tag[1] = 'STOP'
            elif tag[0] in attributes:
                attributes[tag[0]] += 1
def test_save_to_CSV(self):
    FileOperations.save_dict_to_CSV(
        {"first,row": {"first": 1, "second": 2, "third": 3}},
        "./test/test_outputs/save_csv.csv",
        {"First Col", "Second col", "Third col"})
    f = open("./test/test_outputs/save_csv.csv")
    self.assertIsNotNone(f)
    f.close()
from FileOperations import FileOperations
import nltk
from nltk.tag.stanford import StanfordPOSTagger
from nltk.corpus import stopwords
import operator
import os
import re

# set the java environment variables:
# CLASSPATH is the path to stanford-postagger.jar on your local disk
# STANFORD_MODELS is the path to the tagger file on your local disk
os.environ['CLASSPATH'] = '/home/sol315/Downloads/stanford-postagger-2015-12-09/stanford-postagger.jar'
os.environ['STANFORD_MODELS'] = './models/english-left3words-distsim.tagger'

fo = FileOperations("taged.data")
tages = fo.get_taged_data()
stop = set(stopwords.words('english'))
attributes = dict()
regex = re.compile('[^a-zA-Z]')

for line in tages:
    for tag in line:
        if tag[1] == 'NN' or tag[1] == 'NNS':
            tag[0] = regex.sub('', tag[0]).lower()
            if tag[0] in stop or len(tag[0]) <= 1:
                tag[1] = 'STOP'
            elif tag[0] in attributes:
                attributes[tag[0]] += 1
        if course.theme not in theme_list:
            theme_list.append(course.theme)
    return sorted(theme_list)


if __name__ == "__main__":
    from FileOperations import FileOperations

    data_handler = None
    print("Sample data\n")
    input_filename = "data.a"
    file_handler = FileOperations(input_filename)
    if file_handler.status:
        print(file_handler.data, "\n")
        data_handler = DataHandler(file_handler.data)
        data_handler.print_courses_list()

    from Command import VALID_COMMANDS_REQ

    print("\nCommand: locations")
    command = ["locations"]
    data_handler.process_command(command)
    print("\nCommand: courses <location> <theme>")
    print("\nEg: 1")
def test_getDataFromJSON(self):
    data = FileOperations.get_from_JSON_file("./test/search_parameters.json")
    self.assertIsNotNone(data)
    self.assertEqual(data["result"]["built-in"][2], "AUTHOR")
nameOut = (name + "_Constraints")
DOPobject.separateChildContent(constraintsDF, "Constraint Type", ret=0, name=nameOut)
nameOut = (name + "_Products")
DOPobject.processSingleDataset(productDF, nameOut)

from FileOperations import FileOperations

expPath = root + config["DEFAULT"]["ExportPath"]
#f = FileOperations("E:/CUA OpenBank API/OpenBanking/DataProcesing")
f = FileOperations(root)
i = 0
#for sheet in [3,4,5,6]:
sheets = [
    'TRANS_AND_SAVINGS_ACCOUNTS', 'CRED_AND_CHRG_CARDS', 'TERM_DEPOSITS',
    'TERM_DEPOSITS_RATES'
]
for sheet in sheets:
    if sheet == 'TERM_DEPOSITS_RATES':
        rates = SP(dataFile, sheet)
        rates.log = log
        rates.path = exportPath + "/"
        rates.createDict()
def main(argv):
    BaseBackupArch = ''
    Directory = ''
    DatabaseUser = ''
    DatabaseGroup = ''
    PostgresConfig = ''
    PostgresPort = ''
    usage = ('%s -a <basebackup tar archive> -d <new PGDATA directory> '
             '[-u <postgres user, default postgres> -g <postgres group, default postgres>, '
             '-c <path to postgres config, default in new PGDATA dir> '
             '-p <port for new pg_cluster, default 5433>]' % os.path.abspath(__file__))
    try:
        opts, args = getopt.getopt(argv, "ha:d:u:g:c:p:",
                                   ["archive=", "directory=", "user=", "group=", "config=", "port="])
    except getopt.GetoptError:
        print usage
        sys.exit(2)
    if len(opts) == 0:
        print usage
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print usage
            sys.exit()
        elif opt in ("-a", "--archive"):
            BaseBackupArch = arg
        elif opt in ("-d", "--directory"):
            Directory = arg
        elif opt in ("-u", "--user"):
            DatabaseUser = arg
        elif opt in ("-g", "--group"):
            DatabaseGroup = arg
        elif opt in ("-c", "--config"):
            PostgresConfig = arg
        elif opt in ("-p", "--port"):
            PostgresPort = arg
    if os.path.isfile(BaseBackupArch) and os.path.exists(Directory):
        if os.path.isabs(BaseBackupArch) and os.path.isabs(Directory):
            ExtractFile(BaseBackupArch, Directory)
            logging.info('Successfully extracted basebackup archive file: %s into directory: %s'
                         % (BaseBackupArch, Directory))
            fop = FileOperations(Directory)
            if DatabaseUser and DatabaseGroup:
                fop.setFileOwner(DatabaseUser, DatabaseGroup)
                fop.setFilePerm()
            elif DatabaseUser:
                fop.setFileOwner(user=DatabaseUser)
                fop.setFilePerm()
            elif DatabaseGroup:
                fop.setFileOwner(group=DatabaseGroup)
                fop.setFilePerm()
            else:
                fop.setFileOwner()
                fop.setFilePerm()
            logging.info('Successfully prepared new PGDATA directory: %s' % Directory)
            if PostgresConfig:
                if os.path.isfile(PostgresConfig):
                    if PostgresPort:
                        process_config(PostgresConfig, PostgresPort)
                    else:
                        process_config(PostgresConfig)
                else:
                    logging.error('%s no such file' % PostgresConfig)
            else:
                if Directory.endswith("/"):
                    PostgresConfig = Directory + 'postgresql.conf'
                else:
                    PostgresConfig = Directory + '/postgresql.conf'
                if os.path.isfile(PostgresConfig):
                    if PostgresPort:
                        if Directory.endswith('/'):
                            process_config(config=PostgresConfig,
                                           logFileName=Directory.split('/')[-2],
                                           port=PostgresPort)
                        else:
                            process_config(config=PostgresConfig,
                                           logFileName=Directory.split('/')[-1],
                                           port=PostgresPort)
                    else:
                        if Directory.endswith('/'):
                            process_config(config=PostgresConfig,
                                           logFileName=Directory.split('/')[-2])
                        else:
                            process_config(config=PostgresConfig,
                                           logFileName=Directory.split('/')[-1])
                else:
                    logging.error('%s no such file' % PostgresConfig)
        else:
            logging.error('Check archive file or directory have an absolute path')
            sys.exit(2)
    else:
        logging.error('Check archive file or directory exists')
        sys.exit(2)
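# Example invocation (the script name is hypothetical; the flags match the
# getopt spec above — absolute paths are required for the archive and PGDATA):
#
#   python prepare_pgdata.py -a /backups/base.tar -d /var/lib/postgresql/standby \
#       -u postgres -g postgres -p 5433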
class TemperatureReader:
    def __init__(self, tempReadFile, tempSaveDirectory, tempSaveFilename, interval):
        self.temperatureReadFile = tempReadFile
        self.temperatureSaveFile = os.path.join(tempSaveDirectory, tempSaveFilename)
        self.sampleInterval = interval
        self.isRecording = False
        self.recordingLoopActive = False
        self.threadsStarted = False
        self.nextTimer = None
        self.tempLogger = FileOperations(tempSaveDirectory, tempSaveFilename)

    #starts recording temperature at the specified interval
    def startRecording(self):
        self.recordingLoopActive = True
        if self.threadsStarted == False:
            threading.Timer(0, self.sampleTemperatureWithInterval, ()).start()
            self.threadsStarted = True

    #requests the recording thread to stop
    def stopRecording(self):
        self.recordingLoopActive = False

    #used to force the thread to stop recording if there was an error in recording data
    def resetIsRecording(self):
        self.isRecording = False

    #this method is called by timer threads to record data at the specified interval
    def sampleTemperatureWithInterval(self):
        #launching the next timer thread to record temp after the specified interval
        self.nextTimer = threading.Timer(self.sampleInterval,
                                         self.sampleTemperatureWithInterval, ())
        self.nextTimer.start()
        if self.recordingLoopActive == True and self.storage.hasSpace():
            self.isRecording = True
            #the line below ensures that, even when there is an error recording temperature,
            #isRecording won't stay on; the pi has 10 seconds to record temperature
            threading.Timer(10, self.resetIsRecording, ()).start()
            try:
                temperature = self.readTemperature()
                timestamp = TimeUtils.getTimestamp()
                output = "%s %s\n" % (timestamp, temperature)
                #adding the temperature to the temperature file
                self.tempLogger.appendToFile(output)
                self.logger.log("[TemperatureReader] Recorded temperature")
            except Exception as e:
                self.logger.logError("TemperatureReader", "Error reading temperature", e)
            self.isRecording = False

    #parses the system file to get the temperature in Celsius
    def readTemperature(self):
        with open(self.temperatureReadFile, 'r') as temperatureFile:
            text = temperatureFile.read()
        secondLine = text.split('\n')[1]
        temperatureData = secondLine.split(' ')[9]
        temperature = float(temperatureData[2:])
        return temperature / 1000

    #cancels any timers that are waiting to execute; used when quitting the program
    def quit(self):
        if self.nextTimer != None:
            self.nextTimer.cancel()

    def setLogger(self, logger):
        self.logger = logger

    def setStorage(self, storage):
        self.storage = storage
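# readTemperature() assumes the w1-therm sysfs format (loaded by setupGPIO's
# modprobe calls). A typical /sys/bus/w1/devices/<id>/w1_slave file looks like:
#
#   72 01 4b 46 7f ff 0e 10 57 : crc=57 YES
#   72 01 4b 46 7f ff 0e 10 57 t=23125
#
# The tenth space-separated token of the second line is "t=23125"; stripping
# the "t=" prefix and dividing by 1000 yields 23.125 degrees Celsius.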
import json
import os

import nltk
from nltk.tag.stanford import StanfordPOSTagger

from FileOperations import FileOperations

# set the java environment variables:
# CLASSPATH is the path to stanford-postagger.jar on your local disk
# STANFORD_MODELS is the path to the tagger file on your local disk
os.environ['CLASSPATH'] = '/home/sol315/Downloads/stanford-postagger-2015-12-09/stanford-postagger.jar'
os.environ['STANFORD_MODELS'] = './models/english-left3words-distsim.tagger'

fo = FileOperations("../input.json")
fo.get_json()
st = StanfordPOSTagger('english-bidirectional-distsim.tagger')
f = open('taged.data', 'a')
cur = 0
for line in fo.reviews:
    cur += 1
    print cur, cur * 100 / fo.num_lines, '%'
    res = st.tag(line.split())
    json_tag = json.dumps(res)
    f.write(json_tag)
    f.write('\n')
import argparse
import os.path

from FileOperations import FileOperations as FO
from Crawler import Crawler

parser = argparse.ArgumentParser(description='Crawl files and execute regex rules on them')
parser.add_argument('-p', metavar='ParameterFilePath', type=argparse.FileType('r'), required=True,
                    help="path to a parameter json file. The parameter file should contain "
                         "'crawling', 'rules' and 'result' keys")
parser.add_argument('-o', metavar='OutputFilePath', type=argparse.FileType('w+'),
                    help='output file. This argument is required if no output is specified in the '
                         'parameter file.\n The file must be either a .csv or .json')
parser.add_argument('-mt', metavar='ThreadNumber', type=int,
                    help='run a multi-threaded crawler (1 thread per file) and specify the number '
                         'of concurrent threads')
parser.add_argument('-s', metavar='StartDirectory', type=str,
                    help='directory in which the crawling will start. This parameter is necessary '
                         'if there is no "crawling" dictionary in the parameter file')
args = parser.parse_args()

if "p" not in args or args.p is None:
    parser.error(parser.format_usage())
param = FO.get_from_JSON_file(args.p.name)
if "rules" not in param or ("o" not in args and "output" not in param):
    print("rules error")
    parser.error(parser.format_usage())
if "crawling" not in param and ("s" not in args or args.s is None):
    parser.error(parser.format_usage())
elif "s" in args and args.s is not None:
    param["crawling"] = {"start": args.s}
if "o" in args and args.o is not None:
    output_name, output_extension = os.path.splitext(args.o.name)
    param["output"] = {
        "path": args.o.name,
        "type": "csv" if ".csv" in output_extension else "json"
    }
if "mt" in args and args.mt is not None:
    Crawler.crawl_multithread(param.get("crawling"), param.get("rules"),
                              param.get("result"), param["output"], args.mt)
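# A plausible parameter file for the CLI above, inferred from the keys the
# tests exercise ("crawling", "rules", "result", "output"); the "HasName"/
# "city" rule names come from the minimal-parameter test, the regex and the
# first two "built-in" entries are illustrative guesses:
#
#   {
#     "crawling": {"start": "./test/test_inputs", "max_depth": 2},
#     "rules":    {"HasName": {"city": ["city: (\\w+)"]}},
#     "result":   {"built-in": ["DATE UPDATED", "TITLE", "AUTHOR"]},
#     "output":   {"path": "./test/test_outputs/results.json", "type": "json"}
#   }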
nameOut = (name + "_Constraints")
DOPobject.separateChildContent(constraintsDF, "Constraint Type", ret=0, name=nameOut)
nameOut = (name + "_Products")
DOPobject.processSingleDataset(productDF, nameOut)

from FileOperations import FileOperations

expPath = root + config["DEFAULT"]["ExportPath"]
#f = FileOperations("E:/CUA OpenBank API/OpenBanking/DataProcesing")
f = FileOperations(root)
i = 0
#for sheet in [3,4,5,6]:
sheets = [
    'TRANS_AND_SAVINGS_ACCOUNTS', 'CRED_AND_CHRG_CARDS', 'TERM_DEPOSITS',
    'TERM_DEPOSITS_RATES'
]
for sheet in sheets:
    if sheet == 'TERM_DEPOSITS_RATES':
        rates = SP(dataFile, sheet)
        if not rates.df.iloc[:, 2:].empty:
            rates.log = log
            rates.path = exportPath + "/"
            rates.createDict()