"""
Created on 30 Apr 2019

@author: ostlerr

Module start-up: reads ``config.ini``, takes the source-document directory
from ``[EXPERIMENT] srcdocs``, and prints the result of ``getPageScan`` for
every entry in that directory (in sorted file-name order).
"""
import configparser
import os
import re

from imageToText.YieldBookToData import getPageScan, correctWords, removePunctuation

config = configparser.ConfigParser()
config.read('config.ini')
#experiment = config['EXPERIMENT']['name']
#outfile = open(config['EXPERIMENT']['outfile'], "w+", 1)
srcdocs = config['EXPERIMENT']['srcdocs']

# Dump every scan in the source directory. The first four characters of the
# file name are printed as the year and the next two as the page number.
fileList = sorted(os.listdir(srcdocs))
for fname in fileList:
    nyear = fname[0:4]
    npage = fname[4:6]
    print(nyear + " - " + npage)
    # os.path.join instead of a hard-coded "\\" so the path also resolves on
    # non-Windows platforms.
    page = getPageScan(os.path.join(srcdocs, fname))
    print(page)
def loopDocs():
    """Extract metadata and sponsors from every ``.jpg`` scan in ``srcdocs``.

    For each file name in ``srcdocs`` (sorted) that ends in ``.jpg``:

    * the page text is read with ``getPageScan`` and passed, split on single
      spaces, through ``correctWords`` together with ``corrections``;
    * the "Object:", "Design:", "Plot dimensions:" and "Treatments:" sections
      are cut out with ``trimPage`` and stored on a fresh ``Metadata``;
    * if an "Object:" section was found, one pipe-delimited record is
      appended to ``metadataOutfile``;
    * if a "Sponsors:" section was found, one comma-delimited line per
      sponsor returned by ``getSponsors`` is appended to ``sponsorOutfile``.

    The first four characters of the file name are used as the year.
    Progress is printed to stdout. Returns ``None``.
    """
    # Kept from the original; nothing visible in this function assigns it.
    global year

    for fname in sorted(os.listdir(srcdocs)):
        nyear = fname[0:4]
        # A year-range filter (1992-2006) used to be applied here as well;
        # it is disabled, so every .jpg is processed.
        if not fname.endswith(".jpg"):
            continue

        # os.path.join instead of a hard-coded "\\" so the path also resolves
        # on non-Windows platforms.
        rawPage = getPageScan(os.path.join(srcdocs, fname))
        print("RAWPAGE: [" + rawPage + "]")

        # Splitting on single spaces only (not on "\n") retains the line
        # breaks inside the words handed to correctWords, which allows
        # testing words joined across a line break.
        rawPage = correctWords(rawPage.split(" "), corrections)

        metadata = Metadata()
        hasMetadata = False

        if "Object:" in rawPage:
            hasMetadata = True
            section = trimPage(rawPage, "Object:", "Sponsors:")
            metadata.object = section.replace("\n", " ")
            print("OBJECT: [" + metadata.object + "]")
            metadata.field = identifyField(metadata.object)
            print("FIELD: [" + metadata.field + "]")

        if "Design:" in rawPage:
            section = trimPage(rawPage, "Design:", "Plot dimensions")
            metadata.design = section.replace("\n", " ")
            print("DESIGN: [" + metadata.design + "]")

        # NOTE(review): the presence check is case-insensitive but trimPage
        # is given the capitalised marker - confirm trimPage copes when the
        # page spells it differently.
        if "plot dimensions:" in rawPage.lower():
            section = trimPage(rawPage, "Plot dimensions:", "Treatments")
            metadata.wholeplots = section.replace("\n", " ")
            print("DIMENSIONS: [" + metadata.wholeplots + "]")

        # Sub-plot dimensions are not extracted (that check was left disabled);
        # metadata.subplots is presumably whatever Metadata() initialises it to.

        if "Treatments:" in rawPage:
            section = trimPage(rawPage, "Treatments:", "Experimental diary")
            # Each newline becomes backslash + space ("markdown paragraph").
            # The original spelled this "\ ", an unrecognised escape that
            # yields the same two characters but triggers a SyntaxWarning on
            # recent Python versions.
            # NOTE(review): a markdown hard break normally needs a newline
            # after the backslash - confirm that backslash + space is intended.
            metadata.treatments = section.replace("\n", "\\ ")
            print("TREATMENTS: [" + metadata.treatments + "]")

        if hasMetadata:
            record = "|".join([
                experiment,
                str(nyear),
                metadata.field,
                metadata.object,
                metadata.design,
                metadata.wholeplots,
                metadata.subplots,
                metadata.treatments,
            ])
            metadataOutfile.write(record + "\n")

        if "Sponsors:" in rawPage:
            sponsorText = trimPage(rawPage, "Sponsors:", "The").replace("\n", " ")
            print(sponsorText)
            for sponsor in getSponsors(sponsorText):
                sponsorOutfile.write(experiment + "," + str(nyear) + "," + sponsor + "\n")