Example #1
from glob import glob

from sklearn.metrics import classification_report, confusion_matrix

# tick/tock, readFeaturesAndLabels, train, writeToFile and the config globals
# (datasetDir, expectedConfigs, configsToRun) are assumed to be defined elsewhere.


def main():
    tick("loading features data from files")
    x_train, x_test, y_train, y_test = readFeaturesAndLabels(datasetDir)
    tock("data is loaded into python objects")

    tick("training different classifiers with different configurations")
    # startIndex = 0
    for config in expectedConfigs:
        i = config['N']
        if i not in configsToRun:
            print(f"ignoring index {i}..")
            continue
        if glob(f"reports/N{i}*.txt"):
            print(f"already trained, ignoring index {i}..")
            continue

        # drop the index key so only the hyperparameters reach train()
        del config['N']
        tick(f"training the classifier with config {config}")
        currentClassifier = train(x_train, y_train, config)
        tock("classifier trained")

        tick("calculating the metrices")
        y_pred = currentClassifier.predict(x_test)
        mat = confusion_matrix(y_test, y_pred)
        rep = classification_report(y_test, y_pred)
        score = currentClassifier.score(x_test, y_test)
        tock("metrices calculated successfully")

        writeToFile(i, config, mat, rep, score, currentClassifier)

    tock("the whole test cases has ended")
Example #2
    def train(self, *phonesNames, limit=1000, loadFeat=False):
        self.scalerSet = limit
        # either load precomputed features or read the raw train set lazily
        trainSetGenerator = self._loadFeatures(
            *phonesNames, modelsSet=limit) if loadFeat else self._readTrainSet(
                limit=limit, customPhones=phonesNames)
        for phoneLabel, features in trainSetGenerator:
            # if(saveFeat and not loadFeat): self._saveFeatures(features, phoneLabel, limit)
            self._verbose("train model", phoneLabel)
            tick(f"timing total train time of {phoneLabel}")
            trainedModel = self._trainModel(phoneLabel, features)
            tock("train end")
            # models are stored under modelsDir/<limit>/<phoneLabel><ext_model>
            loc = os.path.join(self.modelsDir, str(limit))
            os.makedirs(loc, exist_ok=True)
            loc = os.path.join(loc, phoneLabel) + self.ext_model
            self._saveModel(loc, trainedModel)
Example #3
    def _train(self, phoneLabel, limit, loadFeat):
        self._verbose(f"{phoneLabel}: train model", phoneLabel)
        trainSetGenerator = self._loadFeatures(
            phoneLabel, modelsSet=limit) if loadFeat else self._readTrainSet(
                limit=limit, customPhones=[phoneLabel])
        label, features = next(trainSetGenerator)
        if label != phoneLabel:
            raise RuntimeError(
                f"{phoneLabel}: invalid state, trainSetGenerator returned "
                f"features of {label} but {phoneLabel} was expected")
        tick(f"timing total train time of {phoneLabel}")
        trainedModel = self._trainModel(phoneLabel, features)
        tock(f"{phoneLabel}: train end")
        loc = os.path.join(self.modelsDir, str(limit))
        os.makedirs(loc, exist_ok=True)
        loc = os.path.join(loc, phoneLabel) + self.ext_model
        self._saveModel(loc, trainedModel)
Example #4
import datetime as dt

import pandas as pd

# getBody (message-body extraction) is assumed to be defined elsewhere.


def jsonList2DF(msgList):
    '''Converts raw message data (a list of Gmail API JSON responses)
    to a pandas DataFrame with sender, subject, date, and message body,
    indexed by unique message ID.'''
    data = {}
    count = 0
    print(f"converting {len(msgList)} messages")
    for msg in msgList:
        d = {}
        for header in msg['payload']['headers']:
            if header['name'] == 'From':
                d['from'] = header['value']
            elif header['name'] == 'Subject':
                d['subject'] = header['value']

        d['body'] = getBody(msg['payload'])
        d['date'] = dt.datetime.fromtimestamp(int(msg['internalDate']) / 1e3)
        data[msg['id']] = d
        count += 1
        if count % 2000 == 0:
            # progress fraction; Example #7 prints this same expression, so
            # the original tock(...) here was likely a typo for print(...)
            print(float(count) / len(msgList))
    return pd.DataFrame(data).transpose()
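
A hypothetical usage example, pairing this converter with the messages.p pickle
written in Example #7 (the column names follow the keys set above):

# hypothetical usage, assuming messages.p produced by Example #7
import pickle

with open("messages.p", "rb") as f:
    rawMessages = pickle.load(f)
df = jsonList2DF(rawMessages)
print(df[['from', 'subject', 'date']].head())
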
Example #5
    def emissions(self, *phones, path=None, modelsSet=200):
        '''
        Extract emission probabilities of a single file, or of every feature
        file in the directory if path is a directory.
        '''
        if path is None or not os.path.exists(path):
            raise FileNotFoundError(f"the path can't be found: {path}")
        paths = [path]
        if os.path.isdir(path):
            self._verbose(
                f"extracting emissions of dir {os.path.abspath(path)}")
            join = lambda f: os.path.join(path, f)
            # skip feature files whose emissions output already exists
            exist = lambda f: os.path.exists(
                join(f.replace(self.ext_feat, self.ext_emissions)))
            paths = [
                join(file) for file in sorted(os.listdir(path))
                if file.endswith(self.ext_feat) and not exist(file)
            ]
        for featFile in paths:
            tick()
            self._fileEmissions(*phones,
                                featPath=featFile,
                                modelsSet=modelsSet)
            tock()
Example #6
        data is a tuple of (features, lengths)
        '''
        hmml = HMMTrainer(GMM=self.GMM, name=label)
        hmml.train(data[0], lens=data[1])
        return hmml.model

    def _computeScore(self, model, data):
        '''
        Computes the score of the data given the model: the max-likelihood of
        generating the data from this model.
        '''
        features, lens = data
        return model.score(features, lengths=lens)

    def _modelInfo(self, model):
        '''
        Returns the model info of the given model.
        '''
        # TODO consider using dict as it is more convenient and easier
        # return {
        # 	"name": model.name,
        # 	"transmat": model.transmat_
        # }
        return HMMInfo(model.name, transmat=model.transmat_)


if __name__ == "__main__":
    from fire import Fire
    tick("timing the whole run")
    Fire(HMM_HMML)
    tock("the whole run")
Example #7
# (snippet starts inside the oauth2client credential-refresh branch)
    flow = client.flow_from_clientsecrets('client_secret.json', SCOPES)
    creds = tools.run_flow(flow, store, flags)
GMAIL = discovery.build('gmail', 'v1', http=creds.authorize(Http()))
MSGS = GMAIL.users().messages()
tick()
msgList = []
response = MSGS.list(userId='me', q='ut-lists').execute()
msgList.extend(response.get('messages', []))

while 'nextPageToken' in response:
    token = response['nextPageToken']
    response = MSGS.list(userId='me', q='ut-lists', pageToken=token).execute()
    # a page may legitimately lack 'messages'; default to an empty list
    msgList.extend(response.get('messages', []))

tock('getMessages')
# batch = BatchHttpRequest()
# for msg in messages:
#     batch.add(GMAIL.users().messages().get(userId = 'me', id = msg['id']), callback = callback)
# batch.execute()
# tock('execute batch')
count = 0
rawMessages = []
tick()
for msg in msgList:
    rawMessages.append(MSGS.get(userId='me', id=msg['id']).execute())
    count += 1
    if count % 2000 == 0:
        print(float(count) / len(msgList))
pickle.dump(rawMessages, open("messages.p", "wb"))
tock('DONE')
Example #8
import argparse
import os
from time import time

import ticktock  # timing helpers; the OCR class is assumed to be defined elsewhere


def main():
    parser = argparse.ArgumentParser(description="OCR parameters")
    # note: argparse ignores default= whenever required=True
    parser.add_argument('-search',
                        '--search',
                        help='Enable search or not (pass "True" to enable)',
                        required=True,
                        type=str,
                        default="False")
    parser.add_argument('-graph',
                        '--graph',
                        help="Text-format openfst decoding graph",
                        required=False,
                        default='LG.txt')
    parser.add_argument('-lmweight',
                        '--lmweight',
                        help='Relative weight of LM score',
                        required=False,
                        type=float,
                        default=1)
    parser.add_argument('-beam_width',
                        '--beam_width',
                        help='Maximum token count per frame',
                        required=False,
                        type=int,
                        default=250)
    parser.add_argument('-sentLen',
                        '--sentLen',
                        help='Number of words in a sentence given to search',
                        required=True,
                        type=int,
                        default=1)
    parser.add_argument('-ilabels',
                        '--ilabels',
                        help="Text files containing input labels",
                        type=str,
                        required=True,
                        default="input_labels.txt")
    # parser.add_argument('-refPath', '--refPath',
    #                     help="Folder containing reference text files which are also image file names to run OCR on",
    #                     type=str, required=True, default=None)
    parser.add_argument('-predPath',
                        '--predPath',
                        help='path to write output hypotheses',
                        type=str,
                        required=True,
                        default=None)
    parser.add_argument('-tp',
                        '--timePath',
                        help='path to write output time for each image',
                        type=str,
                        required=True,
                        default=None)
    parser.add_argument('-imgsPath',
                        '--imgsPath',
                        help='Path where scanned images live',
                        type=str,
                        required=False,
                        default='./scanned/')

    args = parser.parse_args()

    withSearch = args.search == "True"
    prog = OCR(args.graph,
               args.ilabels,
               lmWeight=args.lmweight,
               beamWidth=args.beam_width,
               sentLen=args.sentLen,
               withSearch=withSearch)

    with open(args.timePath, 'w') as timeFile:
        for fileName in os.listdir(args.imgsPath):
            startTime = time()
            print(f"Start image {fileName}")

            predictedText = prog.getTextFromImage(
                os.path.join(args.imgsPath, fileName))
            # tock presumably pairs with a tick() called inside getTextFromImage
            elapsedSeconds = ticktock.tock("", log=False)

            print(f'Image {fileName} took {int(time() - startTime)} seconds')
            outPath = os.path.join(args.predPath,
                                   fileName.replace(".png", ".txt"))
            with open(outPath, 'w', encoding="utf-8") as f:
                f.write(predictedText)
            timeFile.write(str(elapsedSeconds) + '\n')
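
A hypothetical invocation covering all of the required flags (the script name
and paths are assumptions, not from the source):

# hypothetical run (script name and paths assumed)
#   python ocr_main.py --search True --sentLen 5 --ilabels input_labels.txt \
#       --predPath preds/ --timePath times.txt --imgsPath ./scanned/
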
Example #9
		plt.ylabel('PCA 2')

		#! PCA of PCA
		# print("PCA of PCA")
		# print("number of sequences:", len(seqsLens))
		# pca = sklearnPCA(n_components=2) #2-dimensional PCA 
		# print(pd.DataFrame(sklearnPCA(n_components=40).fit_transform(reshaped[0])).to_numpy())
		# input("wait")
		# pcaOfSeqs = np.array([pd.DataFrame(sklearnPCA(n_components=40).fit_transform(s)).to_numpy()[0] for s in reshaped])
		# print(pcaOfSeqs.shape)
		# transformed = pd.DataFrame(pca.fit_transform(pcaOfSeqs))
		# transformed = transformed.to_numpy()
		# plt.scatter(transformed[:, 0], transformed[:,1], label=phoneLabel)
		# plt.xlabel('PCA 1')
		# plt.ylabel('PCA 2')


		# plt.show()
	plt.legend()
	plt.show()

	# input("plotted")


if __name__ == "__main__":
	tick("timing the whole time")
	from fire import Fire
	Fire(Inspection)
	tock()

	# main()