def runAnnotate(reader, alg, paramDict, outName):
    """ run the annotator alg over everything in reader and write the resulting
    rows as tab-separated lines.

    Output goes to temp files obtained from newTempOutFile(); they are only
    moved to outName, via moveManyTempToFinal(), after all rows were written
    and the optional alg.cleanup() has run.

    An empty row coming from the annotator is a signal to close the current
    output file and start a new one. If outName is "stdout", no file is
    rotated and the empty row is written out as an empty line instead.
    """
    addFields = paramDict.get("addFields", [])
    toStdout = (outName == "stdout")

    outFh, tmpFnames = newTempOutFile([], outName, alg, addFields)

    for row in runAnnotateIter(reader, alg, paramDict, addFields):
        if len(row) == 0 and not toStdout:
            # empty row = request for a new output file
            outFh.close()
            outFh, tmpFnames = newTempOutFile(tmpFnames, outName, alg, addFields)
        else:
            cleanRow = [pubStore.removeTabNl(val) for val in row]
            outFh.write("\t".join(cleanRow))
            outFh.write("\n")

    # cleanup() is optional for annotators
    if "cleanup" in dir(alg):
        logging.info("Running cleanup")
        alg.cleanup()

    if not toStdout:
        outFh.close()
        moveManyTempToFinal(tmpFnames, outName)
def writeRow(row, outFh):
    """ write the values in row as one tab-separated, newline-terminated line to outFh.

    Every value is converted to unicode and passed through
    pubStore.removeTabNl(); unicode results are then encoded as utf8 and
    anything else is converted with str().
    """
    cells = []
    for val in row:
        cleaned = pubStore.removeTabNl(unicode(val))
        if isinstance(cleaned, unicode):
            cells.append(cleaned.encode('utf8'))
        else:
            cells.append(str(cleaned))
    outFh.write("\t".join(cells))
    outFh.write("\n")
def writeAnnotations(alg, articleData, fileData, outFh, annotIdAdd, doSectioning, addFields):
    """ use alg to annotate fileData and write one tab-separated line per
    annotation to outFh.

    Each output line consists of, in this order:
    - the annotation id, zero-padded to 18 digits
    - the external id of the article ("0" if articleData is None)
    - the values of the article fields named in addFields ("" if missing)
    - the row as returned by alg.annotateFile()
    - the section name, only if doSectioning is set
    - a text snippet, only if the first two headers of alg are "start" and
      "end" and the row coordinates are not (0, 0)

    Annotation ids start at fileId * 10**ANNOTDIGITS + annotIdAdd and grow by
    one per written row.

    If doSectioning is set and the file is not a supplement, the text is split
    with pubGeneric.sectionRanges() and alg is run once per section; otherwise
    alg is run once on the whole text. Written coordinates are the ones
    returned by alg, i.e. relative to the text that was passed to alg.

    Returns the number of annotations written.
    """
    annotDigits = int(pubConf.ANNOTDIGITS)
    annotIdStart = (int(fileData.fileId) * (10**annotDigits)) + annotIdAdd
    logging.debug("fileId %s, annotIdStart %d, fileLen %d" %
        (fileData.fileId, annotIdStart, len(fileData.content)))

    # "\a" characters in the stored content are turned back into newlines
    text = fileData.content.replace("\a", "\n")

    allTextSections = {"unknown": (0, len(text))}
    if fileData.fileType=="supp":
        sections = {"supplement": (0, len(text))}
    elif doSectioning:
        sections = pubGeneric.sectionRanges(text)
        if sections is None:
            sections = allTextSections
    else:
        sections = allTextSections

    # the article-derived fields are identical for all rows: prepare them once.
    # articleData may be None, in this case there are no article fields to look up.
    if articleData is not None:
        extId = articleData.externalId
        artDict = articleData._asdict()
    else:
        extId = "0"
        artDict = {}

    if addFields is not None:
        addValues = [artDict.get(addField, "") for addField in addFields]
    else:
        addValues = []

    annotCount = 0
    for section, sectionRange in sections.iteritems():
        secStart, secEnd = sectionRange
        if section!="unknown":
            logging.debug("Annotating section %s, from %d to %d" % (section, secStart, secEnd))

        # the annotator only gets the text of the current section
        secText = text[secStart:secEnd]
        fileData = fileData._replace(content=secText)
        annots = alg.annotateFile(articleData, fileData)
        if annots is None:
            logging.debug("No annotations received")
            continue

        for row in annots:
            logging.debug("received annotation row: %s" % str(row))
            # prefix with annotation id, extId and the additional article fields
            fields = ["%018d" % (int(annotIdStart)+annotCount), extId]
            fields.extend(addValues)
            # add the fields from the annotator
            fields.extend(row)

            # snippets can only be created if alg actually returns coordinates.
            # Reset for every row, so annotators without coordinates do not fail
            # and a snippet is never carried over from a previous row.
            snippet = None
            if alg.headers[0]=="start" and alg.headers[1]=="end":
                start, end = row[0:2]
                if (start, end) != (0, 0):
                    snippet = getSnippet(secText, start, end)
                # NOTE(review): the previous version also computed
                # secStart+start and secStart+end here, but never wrote them.
                # Confirm whether section-relative coordinates are intended.

            # postfix with section and snippet
            logging.debug("Got row: %s" % str(row))
            if doSectioning:
                fields.append(section)
            if snippet is not None:
                fields.append(snippet)

            fields = [pubStore.removeTabNl(unicode(x)) for x in fields]
            outFh.write("\t".join(fields)+"\n")
            annotCount += 1
            # the per-file counter has to fit into annotDigits digits
            assert(annotCount<10**annotDigits)
    return annotCount
def writeRow(row, outFh):
    """ write the values in row as one tab-separated, newline-terminated line to outFh.

    Every value is converted to unicode and passed through
    pubStore.removeTabNl() before it is written, so row may contain
    non-string values.
    """
    newRow = [pubStore.removeTabNl(unicode(x)) for x in row]
    # join the cleaned values, not the raw input row
    outFh.write("\t".join(newRow))
    outFh.write("\n")