def removesmallchar(basepath=None, min_chars=355):
    """Delete every file under a directory tree that holds too little text.

    Each file is opened in text mode and the lengths of its lines are summed
    after line-ending characters (``os.linesep``) are stripped from both ends
    of every line.  A file whose total is below ``min_chars`` is removed and
    its path is printed.

    Args:
        basepath: Root directory to walk.  When ``None`` (the default) the
            directory is obtained from ``bf.getdrt()``, as in the original
            zero-argument form.
        min_chars: Files with fewer counted characters than this are deleted.
            Defaults to the previously hard-coded value of 355.
    """
    if basepath is None:
        basepath = bf.getdrt()

    for root, _dirs, files in os.walk(basepath):
        for name in files:
            path = os.path.join(root, name)
            # The context manager closes the handle even if reading raises,
            # and guarantees it is closed before os.remove is attempted.
            with open(path, 'r') as infile:
                characters = sum(len(line.strip(os.linesep)) for line in infile)
            if characters < min_chars:
                os.remove(path)
                print('removed file: ' + path)
import sys import os sys.path.insert(1,os.path.join(os.getcwd(),'dependencies')) import buildfolder as bf import shutil # from shutil import copyfile # Change basepath if applicable #basepath = "C:\\Users\\AYuen\\Environmental Protection Agency (EPA)\\ECMS - Documents\\Categorization Data\\" basepath = bf.getdrt() #copypath = "C:\\Users\\AYuen\\Environmental Protection Agency (EPA)\\ECMS - Documents\\newfiles\\" copypath = bf.getdrt() + '\\' # Get all files in the directory qq = [] # Check for unwanted file extensions filterout = ['aiff','arc','asc','avi','bwf','csi','dbf','ddf','dht','dng','dpx','dqt','e00','ebcdic','flac','gdb','gml','ics','jfif','kml','mbox','mov','mp3','mpeg2','mpeg4','mxf','prc','pst','shp','shx','step','u3d','utf16','utf8','warc','wave','wmv','x3d','x3dv'] for (root, dirs, files) in os.walk(basepath, topdown=False): if len(files) > 0: for file in files: qq.append(os.path.join(root,file)) for filename in qq: # Get the filename file = filename.split('\\')[-1] # Get the file extension fileext = filename.split(".")[-1] # For ezEmail - email content, if file is in the pdf folder extract record ID from end of filename if 'pdf' in filename and 'attachment' not in filename and fileext.lower() not in filterout:
from rake_nltk import Rake
from bs4 import BeautifulSoup
import sys
import os
sys.path.insert(1, os.path.join(os.getcwd(), 'dependencies'))
import buildfolder as bf
import datetime
import xlsxwriter
import xlrd
import csv

# Directory where keywords.xlsx lives and where the resulting spreadsheet
# is written.
rootdir = bf.getdrt()
now = datetime.datetime.now()

# Remove any spreadsheet left in that directory by an earlier run.
for fname in os.listdir(rootdir):
    if not fname.startswith("Keyword Spreadsheet"):
        continue
    os.remove(os.path.join(rootdir, fname))
    print('Existing Spreadsheet Deleted')

# The output workbook is named after today's date (month-day-year).
datestamp = now.strftime('%m-%d-%y')
workbook = xlsxwriter.Workbook(
    "{}//Keyword Spreadsheet_{}.xlsx".format(rootdir, datestamp)
)
worksheet = workbook.add_worksheet("Sheet 1")

# Starting cell position; rows and columns are zero indexed.
row1 = 1
col = 0

#get list of files
import string
import time
import sys
import os
sys.path.insert(1, os.path.join(os.getcwd(), 'dependencies'))
import buildfolder as bf
import shutil
import random
import re

#### COMMENT THIS OUT TOO
#stdoutOrigin=sys.stdout
#sys.stdout = open("log.txt", "w")

sourcepath = bf.getdrt()
finalpath = bf.getdrt() + '\\'


def remove_punc(str):
    """Return the text with punctuation removed and digit runs blanked.

    Every character found in ``string.punctuation`` is dropped, then each run
    of one or more digits is replaced by a single space.

    Args:
        str: Text to clean.  The parameter name shadows the builtin but is
            kept so existing keyword callers continue to work.

    Returns:
        The cleaned text.
    """
    # Qualify the lookup: only ``import string`` is in scope here, so a bare
    # ``punctuation`` would raise NameError on the first call.
    without_punct = ''.join(c for c in str if c not in string.punctuation)
    # Raw string so ``\d`` reaches the regex engine as written instead of
    # being treated as an (invalid) string escape.
    return re.sub(r"\d+", " ", without_punct)


def removesmallchar(sourcepath, finalpath=finalpath):
    # NOTE(review): only the folder-creation prologue of this function is
    # visible in this view; confirm against the rest of the body.
    directory = "smallfiles"
    # finalpath is concatenated with the folder name directly, so it is
    # expected to already end with a path separator (the module-level
    # default does).
    destpath = os.path.join(finalpath + directory)
    if not os.path.exists(destpath):
        print('made folder ' + destpath)
        os.mkdir(destpath)
if __name__ == "__main__": #get source directory #get target directory #open the logfile #build q #iterate through q #check for similar file name in folder #tika extract # #save directly to target directory source = bf.getdrt('source') target = bf.getdrt('target') source_q = bf.buildq(source) log = open(os.path.join(target, 'logfile.txt'), 'w+', encoding="utf8", errors='ignore') def process(q): for p, i in enumerate( q[38975:] ): #set the counter here enumerate(q[start:]): example enumerate(q[5:]): content = '' filename = i.split('/')[-1]
import sys sys.path.insert(1,os.path.join('\'.join(os.getcwd().split('\')[:-1]),'dependencies')) import cxwalk import buildfolder import os, zipfile #Old Schedule id must be contained within the name of the zip file #Unzip file, save to folder of schedule id contained in name of file if __name__ == "__main__": print('Starting Script') fail = [] #get the source/target directory source = buildfolder.getdrt('source') target = buildfolder.getdrt('target') #build the folder q source_q = buildfolder.buildq(source) #for each item in q, translate the item, then make the folder count = 0 print(len(source_q)) fail = open(os.path.join(target,'faillog.txt'),'w+') for item in source_q: # loop through items in q count += 1 if not item.endswith(".zip"): continue try: zipObj = zipfile.ZipFile(item, 'r') print(item)
### Track the count ##
import os
import sys

# Make the 'dependencies' folder that sits next to the current working
# directory (i.e. under its parent, using backslash-separated paths)
# importable before loading the project modules below.
_cwd_parts = os.getcwd().split('\\')
_parent_dir = '\\'.join(_cwd_parts[:-1])
sys.path.insert(1, os.path.join(_parent_dir, 'dependencies'))
import dctmdl as dd
import buildfolder as bf

# Destination directory comes from buildfolder.getdrt().
#destination = r'C:\Users\mnguyen\Desktop\test'
destination = bf.getdrt()

# CSV file read below, given relative to the working directory.
#sourcelist = r'C:\Users\mnguyen\Environmental Protection Agency (EPA)\ECMS - Documents\github\Document_Processing_Scripts\Dwl Obj by ID from Schedule\objid.csv'
sourcelist = 'objid.csv'

import csv
import logging

logger = logging.getLogger(__name__)
#except:
#    logging.basicConfig(filename='download')

# The file is opened without a context manager, so the handle stays open
# after this point for the reader created from it.
#with open(sourcelist, newline='') as csvfile:
#reader = csv.reader(csvfile, delimiter=',')
csvfile = open(sourcelist, newline='')
reader = csv.reader(csvfile, delimiter=',')

count = 0
username = '******'