def _convert_dataset(split_name, filenames, dataset_dir):
    assert split_name in ['train', 'test']
    with tf.Session() as sess:
        output_filename = os.path.join(TFRECORD_DIR, split_name + '.tfrecords')
        with tf.python_io.TFRecordWriter(output_filename) as tfrecord_writer:
            for i, filename in enumerate(filenames):
                try:
                    sys.stdout.write('\r>> Converting image %d/%d' % (i + 1, len(filenames)))
                    sys.stdout.flush()
                    # Resize to 224x224 and convert to grayscale raw bytes
                    image_data = Image.open(filename)
                    image_data = image_data.resize((224, 224))
                    image_data = np.array(image_data.convert('L'))
                    image_data = image_data.tobytes()
                    # The first four characters of the base name are the label digits
                    labels = filename.split('/')[-1][0:4]
                    num_labels = []
                    for j in range(4):
                        num_labels.append(int(labels[j]))
                    example = image_to_tfexample(image_data, num_labels[0], num_labels[1],
                                                 num_labels[2], num_labels[3])
                    tfrecord_writer.write(example.SerializeToString())
                except IOError as e:
                    print('Could not read:', filename)
                    print('Error:', e)
                    print('Skip it\n')
    sys.stdout.write('\n')
    sys.stdout.flush()
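# A minimal usage sketch (hypothetical paths): the label slicing above assumes each
# captcha file is named after its four label digits, e.g. "1234.jpg".
# example_filenames = ['captcha/1234.jpg', 'captcha/5678.jpg']
# _convert_dataset('train', example_filenames, 'captcha/')  # writes train.tfrecords under TFRECORD_DIR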
def __prepareData(self, pathsToFold, experiment, prot_attr_name):
    '''
    reads training scores and predictions from disk and arranges them
    nicely into a dataframe
    '''
    predictedScores = {}
    for filename in os.listdir(self.__trainingDir):
        if 'GermanCredit_' + prot_attr_name + '_ALG' in filename:
            # the delta value is encoded in the file name after '='
            delta = float((filename.split('=')[1]).split('.txt')[0])
            predictedScores[delta] = pd.read_csv(self.__trainingDir + '/' + filename, sep=",", header=0)

    if 'german' in self.__dataset:
        groundtruth = pd.read_csv(self.__trainingDir + '/' + 'GermanCredit_' + prot_attr_name + '.csv', sep=",", header=0)
        if self.rev:
            groundtruth['score'] = groundtruth['score'].apply(lambda val: 1 - val)
        groundtruth = (groundtruth.sort_values(by=['score'], ascending=False)).reset_index(drop=True)
    elif 'biased_normal' in self.__dataset:
        groundtruth = pd.read_csv(self.__trainingDir + '/' + 'BiasedNormalSynthetic_' + prot_attr_name + '.csv', sep=",", header=0)
        if self.rev:
            groundtruth['score'] = groundtruth['score'].apply(lambda val: 1 - val)
        groundtruth = (groundtruth.sort_values(by=['score'], ascending=False)).reset_index(drop=True)
    elif 'compas' in self.__dataset:
        groundtruth = pd.read_csv(self.__trainingDir + '/' + 'ProPublica_' + prot_attr_name + '.csv', sep=",", header=0)
        if not self.rev:
            groundtruth['Recidivism_rawscore'] = groundtruth['Recidivism_rawscore'].apply(lambda val: 1 - val)
        groundtruth = (groundtruth.sort_values(by=['Recidivism_rawscore'], ascending=False)).reset_index(drop=True)

    groundtruth['doc_id'] = np.arange(len(groundtruth)) + 1
    return predictedScores, groundtruth
def retrieve_htmlpage_identifier(self, filename):
    # Collect URL path segments that contain digits but no dot
    identifiers = list()
    filename = filename.replace("http://", "").replace("https://", "")
    for part in filename.split("/"):
        if "." not in part and utils.contains_digits(part):
            identifiers.append(part)
    return identifiers
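# Usage sketch (hypothetical URL), assuming utils.contains_digits reports whether a
# string contains any digit:
# retrieve_htmlpage_identifier(self, "https://example.com/archive/2021/issue5/index.html")
# -> ['2021', 'issue5']  ("index.html" is skipped because it contains a dot)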
def imageUpload():
    if request.files['file']:
        file = request.files['file']
        ext = file.filename.split(".")[-1]
        if ext.lower() in ALLOWED_EXTENSIONS:
            # Sanitize the name and make it unique before saving
            filename = secure_filename(file.filename)
            file.filename = filename.split('.')[0] + '_' + str(uuid.uuid4()) + '.' + filename.split('.')[1]
            file.save(os.path.join(app.config['UPLOAD_FOLDER'], file.filename))
            return file.filename
        elif ext.lower() not in ALLOWED_EXTENSIONS:
            print(ext.lower())
            #flash('ERROR! Please upload only jpeg, png, gif or tif.', 'danger')
            return redirect(request.url)
    else:
        return 0
def imageUpload():
    # check if the post request has the file part
    if 'file' in request.files:
        file = request.files['file']
    else:
        return ''

    if file.filename is None or file.filename == '':
        return ''

    ext = file.filename.split(".")[-1].lower()
    if ext in ALLOWED_EXTENSIONS:
        # Sanitize the name and make it unique before saving
        filename = secure_filename(file.filename)
        file.filename = filename.split('.')[0] + '_' + str(uuid.uuid4()) + '.' + filename.split('.')[1]
        file.save(os.path.join(app.config['UPLOAD_FOLDER'], file.filename))
        return file.filename

    # Extension not allowed: roll back any open transaction and re-render the form
    flash('ERROR! Please upload only jpeg, png, gif or tif.', 'danger')
    form = addPointForm(request.form)
    d_conn = pg_operations2.pg_connect2(database, user, password, host, port)
    conn = d_conn['conn']
    conn.rollback()
    d_conn = pg_operations2.pg_disconnect2(d_conn)
    return render_template('registerTrashReporter.html', form=form)
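# A hypothetical way to exercise this endpoint from the command line, assuming the
# view is registered at /imageUpload:
# curl -F "file=@photo.jpg" http://localhost:5000/imageUpload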
def getCaptionRowAndPosition(filepath):
    # Get the filename part, without the extension
    filename, extension = os.path.splitext(os.path.split(filepath)[1])
    # If it starts with numbers, we have a row (and position?) section
    row = -1       # No row specified
    position = -1  # No position
    caption = ""
    if filename[0].isdigit():
        # Get the first part, e.g. "10-01" or "10"
        row_pos = filename.split()[0]
        # The row is the first part, always
        row = int(row_pos.split("-")[0])
        # The position is the second part, if it exists
        if len(row_pos.split("-")) > 1:
            position = int(row_pos.split("-")[1])
        # The caption is everything after the first space, if any
        parts = filename.split(" ", 1)
        caption = parts[1] if len(parts) > 1 else ""
    else:
        caption = filename
    # Hacks to work around the oversize font:
    # 1. Prepend a space
    caption = " " + caption
    # 2. If the last characters are NOT digits, append up to 4 spaces
    # Note: we could be cleverer about this and append fewer for smaller letters
    append = " "
    for i in range(-3, 0, 1):
        if not caption[i].isdigit():
            # Append 2 spaces if the non-digit is 3rd from the end,
            # 3 if it is 2nd from the end, etc.
            append = " " * (i + 5)
    caption = caption + append
    return caption, row, position
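# Usage sketch: "/photos/10-01 Some Caption.jpg" yields row 10, position 1, and the
# padded caption " Some Caption    " (one leading space, four trailing because the
# caption does not end in digits).
# caption, row, position = getCaptionRowAndPosition("/photos/10-01 Some Caption.jpg")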
def extractPageNumberFromFilename(self, filename):
    if utils.contains_digits(filename) == False:
        return 0
    filePrefix = filename.split(".")[0]
    # The page number is usually in the last underscore-separated part, e.g. "_page12"
    pageString = filePrefix.split("_")[-1]
    if not ('page' in pageString):
        # Otherwise look for the underscore-separated part that contains 'page'
        for part in filePrefix.split("_"):
            if 'page' in part:
                pageString = part
    # Keep only the digits of the page part
    pageStr = ''.join(x for x in pageString if x.isdigit())
    if pageStr == '':
        return 0
    filePage = int(pageStr)
    return filePage
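# Usage sketch (hypothetical name): "magazine_2_page12.txt" -> pageString "page12" -> 12.
# self.extractPageNumberFromFilename("magazine_2_page12.txt")  # returns 12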
def move_to_another(self, filename):
    try:
        # The entity type is the second '|'-separated field of the local name
        entity_type = filename.split('|')[1]
        remote_filename = self._get_remote_filename(filename)
        if not remote_filename:
            return
        # These entity types stay in the General Purpose directory
        if entity_type in ('County', 'City', 'Township', 'Village'):
            return
        directory, server_filename = remote_filename
        self.ftp.rename('/General Purpose/{}'.format(server_filename),
                        '/{}/{}'.format(directory, server_filename))
        print('Moved {} to {}'.format(server_filename, directory))
    except Exception as e:
        print(str(e))
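# Usage sketch (hypothetical name, following the '|'-separated format assumed above):
# self.move_to_another('Annual Report|School District|2020.pdf')  # moved out of /General Purpose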
def filter_already_present(self, json_filenames: List[str], spider: str) -> List[str]:
    # Retrieve the already processed files from the database and get their file names
    table_string = f"file {join_decision_and_language_on_parameter('file_id', 'file.file_id')}"
    where_string = f"file.file_id IN {where_string_spider('file_id', spider)}"
    all_filenames_of_spider = self.select(self.get_engine(self.db_scrc), table_string,
                                          "file_name", where_string)
    for filename_chunk in all_filenames_of_spider:
        filename_chunk = list(filename_chunk['file_name'])
        json_filenames = [
            filename for filename in json_filenames
            if filename.split('/')[-1].split('.')[0] not in filename_chunk
        ]
    return json_filenames
def extractVolumeIssueFromFilename(self, filename):
    name = volume = issue = ""
    if utils.contains_digits(filename) == False:
        return 0
    filenameArr = filename.split(".")
    filePrefix = filenameArr[0]
    file = filePrefix.split("_")
    leng = len(file)
    if leng >= 4:
        name = file[1] + " " + file[2]
        volume = file[leng - 2]
        # file[4] only exists when there are at least five parts
        if leng >= 5 and volume != file[4]:
            issue = "" + file[3] + "_" + file[4]
        else:
            issue = "" + file[3]
    else:
        return 0
    issue = issue.replace("-", "")
    return name, volume, issue
def readSinglePatterns(root, resultListBox):
    for filename in os.listdir(singlePatternPath):
        if checkFileName(filename):
            try:
                data = xlrd.open_workbook(os.path.join(singlePatternPath, filename))
                table = data.sheets()[0]
                ncols = table.ncols
                # Row 0 holds the keys, row 1 the values
                row0 = table.row_values(0)
                row1 = table.row_values(1)
                pattern = {}
                for i in range(0, ncols):
                    pattern[row0[i]] = row1[i]
                fileNameKey = filename.split('.', 1)[0]
                singlePatternDict[fileNameKey] = pattern
                resultListBox.insert(END, 'Successfully read pattern ' + filename + '!')
            except Exception as e:
                MessageBoxPromt(0, 'Exception ' + str(e) + '\n ERROR 1 in reading pattern file', root)
                resultListBox.insert(END, 'Failed to read pattern ' + filename + '! Exception: ' + str(e))
                print(e)
                return 0
    return 1
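# Usage note: each pattern workbook is expected to hold keys in row 0 and values in
# row 1; a file "alpha.xlsx" ends up as singlePatternDict['alpha'] = {key: value, ...}.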
def execute_with_params(self, file_pattern="", path="", magazines=None, articles=None):
    listOfMagazines = []
    result = []
    for folder, subs, files in os.walk(path):
        with open(os.path.join(folder, file_pattern), 'w') as dest:
            document = ""
            page_count = 0
            for filename in files:
                if filename == file_pattern:
                    pass
                elif filename.endswith(file_pattern):
                    # Resolve the full path of the document
                    if not folder in path:
                        document = folder + "/" + filename
                    else:
                        document = path + filename
                    doc = self.read(document)
                    if magazines == None:
                        pass
                    elif len(doc) < 1:
                        pass
                    else:
                        fileId = self.getMagazine(filename)
                        mag = magazines[fileId]
                        if fileId not in listOfMagazines:
                            listOfMagazines.append(fileId)
                        page = self.extractPageNumberFromFilename(filename)
                        if page > 0:
                            article = mag.find_article_by_page(page)
                            logger.debug(article)
                            logger.debug(filename)
                            # Derive the magazine name from the part before "page"
                            str_name = filename.split("page")
                            try:
                                name = str_name[0]
                            except IndexError:
                                name = filename
                            mag.set_name(name)
                            if article != None:
                                # The article exists; attach the text to it
                                if len(doc) < 5:
                                    logger.info("SMALL " + str(len(doc)) + " = " + doc)
                                if len(doc) > 0:
                                    self.setLengths(len(doc))
                                    # Split the value in case it is too long to process
                                    if len(doc) > 3000:
                                        sentences = ""
                                        splitted = doc.split(' ')
                                        for split in splitted:
                                            lenn = len(sentences) + len(split) + 1  # +1 for the space
                                            if lenn > 3000:
                                                article.addText(sentences, page)
                                                sentences = ""
                                            sentences += split + " "
                                        if len(sentences) > 0:
                                            article.addText(sentences, page)
                                    else:
                                        article.addText(doc, page)
                                    article.set_len(len(doc))
                            else:
                                # In case we cannot find the article, create one
                                article = Article(filename, page, "")
                                article.addText(doc, page)
                                article.set_len(len(doc))
                                self.setLengths(len(doc))
                                mag.add_article(article)
                        else:
                            # If no article was found for the document, store pages page by page
                            print("stored file " + filename + " as article was not found")
                            article = Article(filename, page, "")
                            article.addText(doc, page)
                            article.set_len(len(doc))
                            mag.add_article(article)
                            self.setLengths(len(doc))
                        magazines[fileId] = mag
                elif filename.endswith(".xml"):
                    page_count = 1 + page_count
                    article = Article(filename, page_count, "")
                    doc, document = self.readDocument(path, folder, filename, document)
                    # Only keep documents that parse as markup
                    if len(doc) > 0 and bool(BeautifulSoup(doc, "html.parser").find()):
                        self.setLengths(len(doc))
                        article.set_len(len(doc))
                        html = htmlParser(doc)
                        article = self.split_document(html.get_text(), article, page_count)
                        result.append(article)
                elif filename.endswith(".html"):
                    page_count = 1 + page_count
                    article = Article(filename, page_count, "")
                    doc, document = self.readDocument(path, folder, filename, document)
                    if len(doc) > 0:
                        self.setLengths(len(doc))
                        article.set_len(len(doc))
                        html = htmlParser(doc)
                        article = self.split_document(html.get_text(), article, page_count)
                        result.append(article)
    logger.debug("VALUES FOR magazines " + str(len(listOfMagazines)))
    for id in listOfMagazines:
        if magazines[id] not in result:
            result.append(magazines[id])
    return result, listOfMagazines
def ArchiveFileS3(bucketname, filename, logfile, **s3params):
    #-----------------------------------------------------------
    #logging
    #-----------------------------------------------------------
    if logfile == '':
        msg = "logfile is mandatory, exiting"
        print(msg)
        sys.exit(1)
    currenttime = datetime.now().strftime('%d%m%Y_%H%M%S')
    logfileobj = open(logfile, "a")
    msg = "ArchiveFileS3 process started"
    logfileobj.write("\n{}: {}".format(currenttime, msg))
    print(msg)
    #-----------------------------------------------------------
    #input parameters check and variable declaration
    #-----------------------------------------------------------
    msg = ''
    if len(s3params) == 0:
        msg = "s3 params are not provided, exiting"
    elif bucketname == '':
        msg = "bucketname is mandatory, exiting"
    elif filename == '':
        msg = "filename is mandatory, exiting"
    if len(msg) != 0:
        currenttime = datetime.now().strftime('%d%m%Y_%H%M%S')
        logfileobj.write("\n{}: {}".format(currenttime, msg))
        print(msg)
        sys.exit(1)

    df = GetFileDatas3(bucketname, filename, logfile, **s3params)
    # Use the newest date in the file for the archive suffix, otherwise today's date
    if 'date' in df.columns:
        datevalue = df['date'].max()
        datevalue = datetime.strftime(datetime.strptime(datevalue, '%d/%m/%Y'), '%d%m%Y')
    else:
        datevalue = datetime.now().strftime('%d%m%Y')

    archivebucket = bucketname + 'archive'
    archivefilename = filename.rsplit('.', 1)[0]
    archivefilextension = filename.rsplit('.', 1)[1]
    archivefile = archivefilename + '_' + datevalue + '.' + archivefilextension
    #-----------------------------------------------------------
    #archive bucket creation
    #-----------------------------------------------------------
    CreateBucketS3(archivebucket, logfile, **s3params)
    time.sleep(3)
    #-----------------------------------------------------------
    #file copy
    #-----------------------------------------------------------
    tobedeleted = CopyFileS3(bucketname, archivebucket, filename, archivefile, logfile, **s3params)
    #-----------------------------------------------------------
    #delete file
    #-----------------------------------------------------------
    if tobedeleted == 1:
        DeleteFileS3(bucketname, filename, logfile, **s3params)
    logfileobj.close()
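# Usage sketch (hypothetical bucket and file): archiving "sales.csv" from bucket
# "mybucket" copies it to bucket "mybucketarchive" as "sales_<ddmmyyyy>.csv" (the
# date taken from the file's 'date' column if present, else today) and deletes the original.
# ArchiveFileS3('mybucket', 'sales.csv', '/tmp/archive.log', **s3params)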
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )

print(unicodeToAscii('Ślusàrski'))

# Build the category_lines dictionary, a list of names per language
category_lines = {}
all_categories = []

# Read a file and split into lines
def readLines(filename):
    lines = open(filename, encoding='utf-8').read().strip().split('\n')
    return [unicodeToAscii(line) for line in lines]

for filename in findFiles('data/names/*.txt'):
    # os.path.basename keeps the category key portable across Windows and Unix paths
    category = os.path.splitext(os.path.basename(filename))[0]
    all_categories.append(category)
    lines = readLines(filename)
    category_lines[category] = lines

n_categories = len(all_categories)
print(category_lines['Korean'][:5])

import torch

# Find letter index from all_letters, e.g. "a" = 0
def letterToIndex(letter):
    return all_letters.find(letter)
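# A minimal sketch of the usual next step (assumed, not part of the snippet above):
# turn a letter into a <1 x n_letters> one-hot tensor using letterToIndex, where
# n_letters = len(all_letters) is assumed to be defined alongside all_letters.
def letterToTensor(letter):
    tensor = torch.zeros(1, n_letters)
    tensor[0][letterToIndex(letter)] = 1
    return tensor

print(letterToTensor('J'))  # one-hot row with a 1 at index letterToIndex('J')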