def main(): tick("loading features data from files") x_train, x_test, y_train, y_test = readFeaturesAndLabels(datasetDir) tock("data is loaded into python objects") tick("training different classifiers with different configurations") # startIndex = 0 for config in expectedConfigs: i = config['N'] if (not (i in configsToRun)): print(f"ignoring index {i}..") continue isExist = len(list(glob(f"reports/N{i}*.txt"))) >= 1 if (isExist): print(f"already trained, ignoring index {i}..") continue del config['N'] tick(f"training the classifier with config {config}") currentClassifier = train(x_train, y_train, config) tock("classifier trained") tick("calculating the metrices") y_pred = currentClassifier.predict(x_test) mat = confusion_matrix(y_test, y_pred) rep = classification_report(y_test, y_pred) score = currentClassifier.score(x_test, y_test) tock("metrices calculated successfully") writeToFile(i, config, mat, rep, score, currentClassifier) tock("the whole test cases has ended")
def train(self, *phonesNames, limit=1000, loadFeat=False):
    self.scalerSet = limit
    trainSetGenerator = self._loadFeatures(
        *phonesNames, modelsSet=limit) if loadFeat else self._readTrainSet(
            limit=limit, customPhones=phonesNames)
    for phoneLabel, features in trainSetGenerator:
        # if (saveFeat and not loadFeat): self._saveFeatures(features, phoneLabel, limit)
        self._verbose("train model", phoneLabel)
        tick(f"timing total train time of {phoneLabel}")
        trainedModel = self._trainModel(phoneLabel, features)
        tock("train end")
        loc = os.path.join(self.modelsDir, str(limit))
        os.makedirs(loc, exist_ok=True)
        loc = os.path.join(loc, phoneLabel) + self.ext_model
        self._saveModel(loc, trainedModel)
def _train(self, phoneLabel, limit, loadFeat):
    self._verbose(f"{phoneLabel}: train model", phoneLabel)
    trainSetGenerator = self._loadFeatures(
        phoneLabel, modelsSet=limit) if loadFeat else self._readTrainSet(
            limit=limit, customPhones=[phoneLabel])
    label, features = next(trainSetGenerator)
    if label != phoneLabel:
        raise RuntimeError(
            f"{phoneLabel}: invalid state, trainSetGenerator returned features "
            f"of {label} but {phoneLabel} was expected")
    tick(f"timing total train time of {phoneLabel}")
    trainedModel = self._trainModel(phoneLabel, features)
    tock(f"{phoneLabel}: train end")
    loc = os.path.join(self.modelsDir, str(limit))
    os.makedirs(loc, exist_ok=True)
    loc = os.path.join(loc, phoneLabel) + self.ext_model
    self._saveModel(loc, trainedModel)
def jsonList2DF(msgList):
    '''Converts raw message data (a list of Gmail API JSON responses) to a
    pandas DataFrame with sender, subject, date, and message body, indexed
    by unique message ID.'''
    data = {}
    count = 0
    print(len(msgList))
    for msg in msgList:
        d = {}
        for header in msg['payload']['headers']:
            if header['name'] == 'From':
                d['from'] = header['value']
            if header['name'] == 'Subject':
                d['subject'] = header['value']
        d['body'] = getBody(msg['payload'])
        # internalDate is in milliseconds since the epoch
        d['date'] = dt.datetime.fromtimestamp(int(msg['internalDate']) / 1e3)
        data[msg['id']] = d
        count += 1
        if count % 2000 == 0:
            tock(float(count) / len(msgList))
    return pd.DataFrame(data).transpose()
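# A minimal usage sketch for jsonList2DF, assuming the imports used above
# (pandas as pd, datetime as dt) and the repo's getBody() helper are in scope.
# The sample message below is fabricated to mirror the Gmail API response
# shape; it is not real API output.
sample_msg = {
    'id': 'abc123',  # unique Gmail message ID (hypothetical)
    'internalDate': '1609459200000',  # milliseconds since the epoch
    'payload': {
        'headers': [
            {'name': 'From', 'value': 'alice@example.com'},
            {'name': 'Subject', 'value': 'hello'},
        ],
        'body': {'data': ''},  # getBody() extracts the text from the payload
    },
}
df = jsonList2DF([sample_msg])
print(df.loc['abc123', 'subject'])  # -> 'hello'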
def emissions(self, *phones, path=None, modelsSet=200):
    '''
    extracts emission probabilities of a file, or of all files in the
    directory if path is a directory
    '''
    if not os.path.exists(path):
        raise FileNotFoundError("The path can't be found")
    paths = [path]
    if os.path.isdir(path):
        self._verbose(
            f"extracting emissions of dir {os.path.abspath(path)}")
        join = lambda f: os.path.join(path, f)
        exist = lambda f: os.path.exists(
            join(f.replace(self.ext_feat, self.ext_emissions)))
        paths = [
            join(file) for file in sorted(os.listdir(path))
            if file.endswith(self.ext_feat) and not exist(file)
        ]
    for featFile in paths:
        tick()
        self._fileEmissions(*phones, featPath=featFile, modelsSet=modelsSet)
        tock()
    data is a tuple of (features, lengths)
    '''
    hmml = HMMTrainer(GMM=self.GMM, name=label)
    hmml.train(data[0], lens=data[1])
    return hmml.model

def _computeScore(self, model, data):
    '''
    computes the score of the data given the model:
    the maximum likelihood of generating the data from this model
    '''
    features, lens = data
    return model.score(features, lengths=lens)

def _modelInfo(self, model):
    ''' returns the model info of the given model '''
    # TODO: consider using a dict, as it is more convenient and easier
    # return {
    #     "name": model.name,
    #     "transmat": model.transmat_
    # }
    return HMMInfo(model.name, transmat=model.transmat_)

if __name__ == "__main__":
    from fire import Fire
    tick("timing the whole run")
    Fire(HMM_HMML)
    tock("the whole run")
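# With the python-fire entry point above, methods such as emissions() can be
# driven from the shell. The invocation below is illustrative only; the module
# name, phone labels, and feature path are made-up examples:
#
#   python hmm_hmml.py emissions aa ae --path=./features --modelsSet=200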
flow = client.flow_from_clientsecrets('client_secret.json', SCOPES)
creds = tools.run_flow(flow, store, flags)
GMAIL = discovery.build('gmail', 'v1', http=creds.authorize(Http()))
MSGS = GMAIL.users().messages()

tick()
msgList = []
response = MSGS.list(userId='me', q='ut-lists').execute()
if 'messages' in response:
    msgList.extend(response['messages'])
while 'nextPageToken' in response:
    token = response['nextPageToken']
    response = MSGS.list(userId='me', q='ut-lists', pageToken=token).execute()
    msgList.extend(response['messages'])
tock('getMessages')

# batch = BatchHttpRequest()
# for msg in messages:
#     batch.add(GMAIL.users().messages().get(userId='me', id=msg['id']), callback=callback)
# batch.execute()
# tock('execute batch')

count = 0
rawMessages = []
tick()
for msg in msgList:
    rawMessages.append(MSGS.get(userId='me', id=msg['id']).execute())
    count += 1
    if count % 2000 == 0:
        print(float(count) / len(msgList))
pickle.dump(rawMessages, open("messages.p", "wb"))
tock('DONE')
def main():
    parser = argparse.ArgumentParser(description="OCR parameters")
    parser.add_argument('-search', '--search',
                        help='Enable search or not',
                        required=True, type=str, default="False")
    parser.add_argument('-graph', '--graph',
                        help="Text-format openfst decoding graph",
                        required=False, default='LG.txt')
    parser.add_argument('-lmweight', '--lmweight',
                        help='Relative weight of LM score',
                        required=False, type=float, default=1)
    parser.add_argument('-beam_width', '--beam_width',
                        help='Maximum token count per frame',
                        required=False, type=int, default=250)
    parser.add_argument('-sentLen', '--sentLen',
                        help='Number of words in a sentence given to search',
                        required=True, type=int, default=1)
    parser.add_argument('-ilabels', '--ilabels',
                        help="Text file containing input labels",
                        type=str, required=True, default="input_labels.txt")
    # parser.add_argument('-refPath', '--refPath',
    #                     help="Folder containing reference text files, which are also the names of the image files to run OCR on",
    #                     type=str, required=True, default=None)
    parser.add_argument('-predPath', '--predPath',
                        help='Path to write output hypotheses',
                        type=str, required=True, default=None)
    parser.add_argument('-tp', '--timePath',
                        help='Path to write the elapsed time for each image',
                        type=str, required=True, default=None)
    parser.add_argument('-imgsPath', '--imgsPath',
                        help='Path where scanned images live',
                        type=str, required=False, default='./scanned/')
    args = parser.parse_args()

    withSearch = args.search == "True"
    prog = OCR(args.graph, args.ilabels, lmWeight=args.lmweight,
               beamWidth=args.beam_width, sentLen=args.sentLen,
               withSearch=withSearch)
    with open(args.timePath, 'w') as timeFile:
        for fileName in os.listdir(args.imgsPath):
            startTime = time()
            print("Start image " + fileName)
            predictedText = prog.getTextFromImage(os.path.join(args.imgsPath, fileName))
            elapsedSeconds = ticktock.tock("", log=False)
            print(f'Image {fileName} took {int(time() - startTime)} seconds')
            with open(os.path.join(args.predPath, fileName.replace(".png", ".txt")),
                      'w', encoding="utf-8") as f:
                f.write(predictedText)
            timeFile.write(str(elapsedSeconds) + '\n')
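# An illustrative invocation of the OCR script above (all flags come from the
# argparse setup; the script name and file/directory names are examples):
#
#   python ocr_main.py --search True --graph LG.txt --lmweight 1 \
#       --beam_width 250 --sentLen 1 --ilabels input_labels.txt \
#       --predPath ./preds --timePath times.txt --imgsPath ./scanned/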
    plt.ylabel('PCA 2')

    #! PCA of PCA
    # print("PCA of PCA")
    # print("number of sequences:", len(seqsLens))
    # pca = sklearnPCA(n_components=2)  # 2-dimensional PCA
    # print(pd.DataFrame(sklearnPCA(n_components=40).fit_transform(reshaped[0])).to_numpy())
    # input("wait")
    # pcaOfSeqs = np.array([
    #     pd.DataFrame(sklearnPCA(n_components=40).fit_transform(s)).to_numpy()[0]
    #     for s in reshaped
    # ])
    # print(pcaOfSeqs.shape)
    # transformed = pd.DataFrame(pca.fit_transform(pcaOfSeqs))
    # transformed = transformed.to_numpy()
    # plt.scatter(transformed[:, 0], transformed[:, 1], label=phoneLabel)
    # plt.xlabel('PCA 1')
    # plt.ylabel('PCA 2')
    # plt.show()

    plt.legend()
    plt.show()
    # input("plotted")

if __name__ == "__main__":
    tick("timing the whole time")
    from fire import Fire
    Fire(Inspection)
    tock()
    # main()