def __init__(self, filename):
    """Parse a tagged collection file into ``self.docs``.

    The file is read line by line. A line starting with ``.I`` begins a new
    record and carries the document id as its second whitespace-separated
    token. Text collected up to ``.A`` becomes the title, text between
    ``.A`` and ``.B`` becomes the author, text between ``.B`` and ``.W`` is
    discarded, and everything after ``.W`` becomes the body.

    Args:
        filename: Path of the collection file to read.

    Raises:
        OSError: If the file cannot be opened.
        IndexError: If a ``.I`` line carries no id token.
    """
    self.docs = []
    docid = ''
    title = ''
    author = ''
    # Bound up front so an empty file, or one that does not open with a
    # ``.I`` line, no longer fails with a NameError.
    buf = ''
    # The context manager closes the file even if parsing raises.
    with open(filename) as cf:
        for line in cf:
            # Markers are matched at the start of the line so that ordinary
            # text which merely contains ".I", ".A", ... is kept as text.
            if line.startswith('.I'):
                if docid != '':
                    # Flush the record that just ended; buf holds its body.
                    self.docs.append(Document(docid, title, author, buf))
                # start a new document
                docid = line.strip().split()[1]
                # Do not let the previous record's fields leak into this one.
                title = ''
                author = ''
                buf = ''
            elif line.startswith('.T'):
                # Title text follows; buf is already empty after ``.I``.
                pass
            elif line.startswith('.A'):
                title = buf  # got title
                buf = ''
            elif line.startswith('.B'):
                author = buf  # got author
                buf = ''
            elif line.startswith('.W'):
                buf = ''  # skip affiliation
            else:
                buf += line
    # The last record has no following ``.I`` line to flush it.
    if docid != '':
        self.docs.append(Document(docid, title, author, buf))
def __init__(self, filename):
    """Parse a tagged collection file into ``self.docs`` and ``self.collection``.

    The file is read line by line. A line starting with ``.I`` begins a new
    record and carries the document id as its second whitespace-separated
    token. Text collected up to ``.A`` becomes the title, text between
    ``.A`` and ``.B`` becomes the author, text between ``.B`` and ``.W`` is
    discarded, and everything after ``.W`` becomes the body. Once parsing is
    done, every document is added to ``self.collection`` under its ``docID``.

    Args:
        filename: Path of the collection file to read.

    Raises:
        OSError: If the file cannot be opened.
        IndexError: If a ``.I`` line carries no id token.
    """
    self.docs = []
    self.collection = Collection()
    docid = ''
    title = ''
    author = ''
    # Bound up front so an empty file, or one that does not open with a
    # ``.I`` line, no longer fails with a NameError.
    buf = ''
    # The context manager closes the file even if parsing raises.
    with open(filename) as cf:
        for line in cf:
            # Markers are matched at the start of the line so that ordinary
            # text which merely contains ".I", ".A", ... is kept as text.
            if line.startswith('.I'):
                if docid != '':
                    # Flush the record that just ended; buf holds its body.
                    self.docs.append(Document(docid, title, author, buf))
                # start a new document
                docid = line.strip().split()[1]
                # Do not let the previous record's fields leak into this one.
                title = ''
                author = ''
                buf = ''
            elif line.startswith('.T'):
                # Title text follows; buf is already empty after ``.I``.
                pass
            elif line.startswith('.A'):
                title = buf  # got title
                buf = ''
            elif line.startswith('.B'):
                author = buf  # got author
                buf = ''
            elif line.startswith('.W'):
                buf = ''  # skip affiliation
            else:
                buf += line
    # The last record has no following ``.I`` line to flush it.
    if docid != '':
        self.docs.append(Document(docid, title, author, buf))

    # Create the document Collection
    for doc in self.docs:
        self.collection.add(doc.docID, doc)
def __init__(self, filename):
    """Parse a tagged collection file into ``self.docs``.

    The file is read line by line. A line starting with ``.I`` begins a new
    record and carries the document id as its second whitespace-separated
    token. Text collected up to ``.A`` becomes the title, text between
    ``.A`` and ``.B`` becomes the author, text between ``.B`` and ``.W`` is
    discarded, and everything after ``.W`` becomes the body.

    Args:
        filename: Path of the collection file to read.

    Raises:
        OSError: If the file cannot be opened.
        IndexError: If a ``.I`` line carries no id token.
    """
    self.docs = []
    docid = ""
    title = ""
    author = ""
    # Bound up front so an empty file, or one that does not open with a
    # ``.I`` line, no longer fails with a NameError.
    buf = ""
    # The context manager closes the file even if parsing raises.
    with open(filename) as cf:
        for line in cf:
            # Markers are matched at the start of the line so that ordinary
            # text which merely contains ".I", ".A", ... is kept as text.
            if line.startswith(".I"):
                if docid != "":
                    # Flush the record that just ended; buf holds its body.
                    self.docs.append(Document(docid, title, author, buf))
                # start a new document
                docid = line.strip().split()[1]
                # Do not let the previous record's fields leak into this one.
                title = ""
                author = ""
                buf = ""
            elif line.startswith(".T"):
                # Title text follows; buf is already empty after ``.I``.
                pass
            elif line.startswith(".A"):
                title = buf  # got title
                buf = ""
            elif line.startswith(".B"):
                author = buf  # got author
                buf = ""
            elif line.startswith(".W"):
                buf = ""  # skip affiliation
            else:
                buf += line
    # The last record has no following ``.I`` line to flush it.
    if docid != "":
        self.docs.append(Document(docid, title, author, buf))
def __read_yaml(self):
    """Load the YAML file at ``self.yaml``, expand includes and save it.

    Every line of the form ``#include <name>`` is replaced with whatever
    ``self.__yaml_include`` returns for that match. The expanded text is
    then parsed as YAML, wrapped in a ``Document`` and saved.
    """
    with open(self.yaml) as f:
        contents = f.read()
    # Raw string: the same pattern as before, but without relying on
    # invalid string escapes (``\#``, ``\s``, ``\S``), which newer Python
    # versions warn about.
    pattern = re.compile(r'^\#include[\s]+([\S]+)[\s]*$', re.MULTILINE)
    contents = pattern.sub(self.__yaml_include, contents)
    # NOTE(review): yaml.load() without an explicit Loader can construct
    # arbitrary Python objects from the input and is rejected by recent
    # PyYAML releases. Consider yaml.safe_load() if these files never use
    # Python-specific tags - confirm before changing.
    doc = Document(self, yaml.load(contents))
    doc.save()
def __contains__( self, docid ):
    """Return whether the document identified by ``docid`` is present in
    the database.

    Presence is probed by calling ``head()`` on the document; any ordinary
    error raised while doing so is reported as "not present".
    """
    try :
        doc = Document( self, docid )
        doc.head()
    except Exception :
        # Was a bare ``except``: that also swallowed KeyboardInterrupt and
        # SystemExit, turning Ctrl-C / interpreter shutdown into ``False``.
        return False
    return True
def readfiles(self, dirname, filename, subdir):
    """Read one message file and append it to ``self.docs`` as a Document.

    The title is the stripped text after a ``Subject:`` header. When a
    ``Lines:`` header is reached, the remainder of the file is read in one
    go and its last ``number_of_lines`` lines (newlines removed, joined
    without a separator) become the body.

    Args:
        dirname: Directory that contains the file.
        filename: Name of the file inside ``dirname``.
        subdir: Suffix appended to ``filename`` (after ``_``) to form the
            document id.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Local import so this method does not depend on a file-level import.
    import os

    # os.path.join instead of a hard-coded "\\" so the path is also valid
    # on non-Windows systems.
    filepath = os.path.join(dirname, filename)
    docid = filename + "_" + subdir
    number_of_lines = 0
    title = ''
    body = ''
    # The context manager closes the file even if reading raises.
    with open(filepath) as cf:
        for line in cf:
            if 'Subject:' in line:
                title = line[9:].strip()  # got title
            elif 'Lines:' in line:
                try:
                    number_of_lines = int(line[6:])
                except ValueError as e:
                    # Special case carried over unchanged: a count whose
                    # parse error mentions 'dog' is treated as 24 lines.
                    if 'dog' in str(e):
                        number_of_lines = 24
                # With number_of_lines == 0 the slice [-0:] keeps every
                # remaining line, exactly as before.
                last_lines = cf.readlines()[-number_of_lines:]
                body = ''.join(i.replace('\n', '') for i in last_lines)
                # readlines() consumed the rest of the file, so there is
                # nothing left to iterate over.
                break
    # convert file to document format
    self.docs.append(Document(docid, title, body))
def read_id_file_into_docs(file):
    """Build a list of Documents from a tab-separated, UTF-8 encoded file.

    Each line is split on tab characters. A line with exactly two fields
    yields ``Document(first, "", second, "")``; any other line only prints
    ``NoNoNo``. A click progress bar is shown while the lines are processed.

    Args:
        file: Path of the file to read.

    Returns:
        The list of Documents, in file order.
    """
    documents = []
    with open(file, mode="r", encoding="utf-8") as handle, \
            click.progressbar(handle.readlines(), label="Reading file") as progress:
        for row in progress:
            fields = row.split("\t")
            if len(fields) != 2:
                print("NoNoNo")
                continue
            first, second = fields
            documents.append(Document(first, "", second, ""))
    return documents
def read_pdf(path: str) -> List[Document]:
    """Return one Document per page of the PDF at ``path``.

    Every Document gets the base name of ``path`` as ``name``, the 1-based
    page number as ``page`` and the page's extracted text as ``content``.
    """
    with open(path, 'rb') as pdf_file:
        pages = PdfFileReader(pdf_file).pages
        # Text is extracted while the file is still open.
        return [
            Document(
                name=ntpath.basename(path),
                page=number,
                content=pdf_page.extractText(),
            )
            for number, pdf_page in enumerate(pages, start=1)
        ]
def main(file):
    """Render the source file ``file`` into a Document and save it.

    The file content is split at the first line consisting solely of 20 or
    more dashes: the part before it is passed to ``doc.init`` and the part
    after it to ``doc.render``. Without such a line the intro is empty and
    the whole content is rendered. A ``PressError`` raised while rendering
    is reported through its ``report()`` method; the document is saved
    either way.

    Args:
        file: Path of the source file.
    """
    doc = Document(file)
    # Context manager: the original ``open(file).read()`` never closed the file.
    with open(file) as source:
        contents = source.read()  # renamed from ``input`` (shadowed the builtin)
    border = re.compile('^-{20,}$', re.MULTILINE)
    # ``maxsplit`` is passed by keyword; passing it positionally to
    # ``re.split`` is deprecated in recent Python versions.
    parts = border.split(contents, maxsplit=1)
    if len(parts) == 2:
        intro, text = parts
    else:
        # No border line found
        intro, text = "", contents
    doc.init(intro)
    try:
        doc.render(text)
    except PressError as e:
        e.report()
    doc.save()
def addall(self, dirpath):
    """Recursively add a Document for every regular file below ``dirpath``.

    Subdirectories are descended into as they are encountered while
    listing ``dirpath``; the files located directly in ``dirpath`` are
    added afterwards, in listing order.
    """
    own_files = []
    for entry in os.listdir(dirpath):
        entry_path = os.path.join(dirpath, entry)
        if os.path.isfile(entry_path):
            own_files.append(entry_path)
        elif os.path.isdir(entry_path):
            self.addall(entry_path)

    for file_path in own_files:
        self.add(Document(file_path))
def build(self, dirpath):
    """Build the vocabulary and the index table from the files in ``dirpath``.

    Steps:
      1. ``self.collect_files`` fills a list with file names for ``dirpath``.
      2. Each file is loaded as a Document, its content is tokenized with
         the instance's ``onlyalpha`` / ``stopwords`` / ``stemmer`` settings
         and the tokens are added to ``self.vocab``.
      3. A new ``IndexTable`` is stored in ``self.itable``, populated from
         ``dirpath`` and its tf-idf values are computed.
    """
    # The listing itself is unused; the call is kept because it raises if
    # ``dirpath`` cannot be listed.
    os.listdir(dirpath)

    collected = []
    self.collect_files(dirpath, collected)
    for name in collected:
        document = Document(os.path.join(dirpath, name))
        self.vocab.addall(
            tools.tokenize(
                document.content,
                self.onlyalpha,
                self.stopwords,
                self.stemmer,
            )
        )

    self.itable = itable.IndexTable(
        self.vocab,
        self.onlyalpha,
        self.stopwords,
        self.stemmer,
    )
    self.itable.addall(dirpath)
    self.itable.compute_tfidf()
def correct_task(version=1):
    """
    Grammar-check text supplied directly by the user (as opposed to text
    taken from an article). The text arrives either as an uploaded file
    or as a string in the request.
    This is a lower level API used by the Greynir web front-end.
    """
    if not (1 <= version <= 1):
        return better_jsonify(valid=False, reason="Unsupported version")

    upload = request.files.get("file")
    if upload is None:
        # No upload: take the text from POSTed form data or a plain string
        try:
            text = text_from_request(request)
        except Exception as exc:
            logging.warning("Exception in correct_task(): {0}".format(exc))
            return better_jsonify(valid=False, reason="Invalid request")
    else:
        # An uploaded file; the object is a proxy that emulates
        # a Werkzeug FileStorage object
        mime = upload.mimetype
        if mime not in SUPPORTED_DOC_MIMETYPES:
            return better_jsonify(valid=False, reason="File type not supported")
        try:
            # Pick the document class matching the MIME type, feed it
            # the raw upload and pull the text out of it
            doc_class = Document.for_mimetype(mime)
            text = doc_class(upload.read()).extract_text()
        except Exception as exc:
            logging.warning("Exception in correct_task(): {0}".format(exc))
            return better_jsonify(valid=False, reason="Error reading file")

    # assert isinstance(request, _RequestProxy)
    progress = cast(Any, request).progress_func
    pgs, stats = check_grammar(text, progress_func=progress)

    # Hand the annotated paragraphs/sentences and the stats back
    # to the client as a JSON structure
    return better_jsonify(valid=True, result=pgs, stats=stats, text=text)
def __init__(self, files):
    """Load newsgroup message files and group their ids by class.

    For every path in ``files`` a Document is appended to ``self.docs``:

    * its id is the file name followed by the newsgroup name found in the
      directory part of the path (the last matching newsgroup wins),
    * its subject is the text after a ``Subject:`` header,
    * its message is every line that follows the first ``Lines:`` header.

    Afterwards each document id is recorded, per newsgroup name it
    contains, in the matching ``self.class1itemsN`` list and under key
    ``'N'`` in ``self.classDocumentLookup``.

    Args:
        files: Iterable of paths to message files.

    Raises:
        OSError: If one of the files cannot be opened.
    """
    # (newsgroup name, class id) in the order the checks are applied.
    newsgroup_classes = (
        ("comp.graphics", '1'),
        ("comp.os.ms-windows.misc", '1'),
        ("comp.sys.ibm.pc.hardware", '1'),
        ("comp.sys.mac.hardware", '1'),
        ("comp.windows.x", '1'),
        ("rec.autos", '2'),
        ("rec.motorcycles", '2'),
        ("rec.sport.baseball", '2'),
        ("rec.sport.hockey", '2'),
        ("sci.crypt", '3'),
        ("sci.electronics", '3'),
        ("sci.med", '3'),
        ("sci.space", '3'),
        ("misc.forsale", '4'),
        ("talk.politics.misc", '5'),
        ("talk.politics.guns", '5'),
        ("talk.politics.mideast", '5'),
        ("talk.religion.misc", '6'),
        ("alt.atheism", '6'),
        ("soc.religion.christian", '6'),
    )

    self.docs = []
    self.class1items1 = []
    self.class1items2 = []
    self.class1items3 = []
    self.class1items4 = []
    self.class1items5 = []
    self.class1items6 = []
    # find class id based on document number
    self.classDocumentLookup = {}

    class_items = {
        '1': self.class1items1,
        '2': self.class1items2,
        '3': self.class1items3,
        '4': self.class1items4,
        '5': self.class1items5,
        '6': self.class1items6,
    }

    for filename in files:
        head, tail = os.path.split(filename)
        # Fallback when the directory names no known newsgroup. Previously
        # ``docid`` was then either unbound (NameError) or silently reused
        # from the preceding file.
        docid = tail
        for group, _class_id in newsgroup_classes:
            if group in head:
                docid = tail + group

        subject = ''
        body_lines = []
        in_body = False
        # The context manager closes each file; the original leaked one
        # open handle per input file.
        with open(filename) as news_file:
            for line in news_file:
                if 'Subject:' in line:
                    subject = line[9:]  # got title
                elif 'Lines:' in line:
                    # The body starts after this header; the header line
                    # itself is not part of the message.
                    in_body = True
                    continue
                if in_body:
                    body_lines.append(line)
        self.docs.append(Document(docid, subject, ''.join(body_lines)))

    for doc in self.docs:
        for group, class_id in newsgroup_classes:
            if group in doc.docID:
                class_items[class_id].append(doc.docID)
                self.classDocumentLookup.setdefault(class_id, []).append(doc.docID)
def __delitem__(self, docid):
    """Delete the document identified by ``docid`` from the database.

    The revision handed to the delete call is the document's ``Etag``
    header value with its first and last characters removed.
    """
    target = Document(self, docid)
    quoted_rev = target.head()['Etag']
    # Drop the leading and trailing quote characters
    target.delete(rev=quoted_rev[1:-1])