Example #1
0
    def __init__(self, filename):
        """Parse a tagged collection file into ``self.docs``.

        The file is scanned line by line. A ``.I <id>`` line starts a new
        record; the text collected before ``.A`` becomes the title, the text
        between ``.A`` and ``.B`` the author, the text between ``.B`` and
        ``.W`` is discarded, and everything after ``.W`` up to the next
        ``.I`` (or end of file) is the body. One ``Document(docid, title,
        author, body)`` is appended per record.

        Args:
            filename: Path of the collection file to read.
        """
        self.docs = []

        docid = ''
        title = ''
        author = ''
        # Bound up front so an empty file, or one that does not start with
        # a '.I' line, cannot hit an unbound name.
        buf = ''

        # The context manager closes the file even if parsing raises.
        with open(filename) as cf:
            for line in cf:
                # Markers are only recognised at the start of a line, so
                # ordinary text such as "J.A. Smith" is not mistaken for one.
                if line.startswith('.I'):
                    if docid != '':
                        self.docs.append(Document(docid, title, author, buf))
                    # start a new document; do not carry fields over from
                    # the previous record
                    docid = line.strip().split()[1]
                    title = ''
                    author = ''
                    buf = ''
                elif line.startswith('.T'):
                    pass  # title text follows; buf was reset at '.I'
                elif line.startswith('.A'):
                    title = buf  # got title
                    buf = ''
                elif line.startswith('.B'):
                    author = buf  # got author
                    buf = ''
                elif line.startswith('.W'):
                    buf = ''  # skip affiliation
                else:
                    buf += line

        # Flush the last record, if the file contained any.
        if docid != '':
            self.docs.append(Document(docid, title, author, buf))
Example #2
0
    def __init__(self, filename):
        """Parse a tagged collection file into ``self.docs`` and ``self.collection``.

        The file is scanned line by line. A ``.I <id>`` line starts a new
        record; the text collected before ``.A`` becomes the title, the text
        between ``.A`` and ``.B`` the author, the text between ``.B`` and
        ``.W`` is discarded, and everything after ``.W`` up to the next
        ``.I`` (or end of file) is the body. One ``Document(docid, title,
        author, body)`` is appended to ``self.docs`` per record, and each
        document is then added to ``self.collection`` under its ``docID``.

        Args:
            filename: Path of the collection file to read.
        """
        self.docs = []
        self.collection = Collection()

        docid = ''
        title = ''
        author = ''
        # Bound up front so an empty file, or one that does not start with
        # a '.I' line, cannot hit an unbound name.
        buf = ''

        # The context manager closes the file even if parsing raises.
        with open(filename) as cf:
            for line in cf:
                # Markers are only recognised at the start of a line, so
                # ordinary text such as "J.A. Smith" is not mistaken for one.
                if line.startswith('.I'):
                    if docid != '':
                        self.docs.append(Document(docid, title, author, buf))
                    # start a new document; do not carry fields over from
                    # the previous record
                    docid = line.strip().split()[1]
                    title = ''
                    author = ''
                    buf = ''
                elif line.startswith('.T'):
                    pass  # title text follows; buf was reset at '.I'
                elif line.startswith('.A'):
                    title = buf  # got title
                    buf = ''
                elif line.startswith('.B'):
                    author = buf  # got author
                    buf = ''
                elif line.startswith('.W'):
                    buf = ''  # skip affiliation
                else:
                    buf += line

        # Flush the last record, if the file contained any.
        if docid != '':
            self.docs.append(Document(docid, title, author, buf))

        # Create the document Collection
        for doc in self.docs:
            self.collection.add(doc.docID, doc)
    def __init__(self, filename):
        """Parse a tagged collection file into ``self.docs``.

        The file is scanned line by line. A ``.I <id>`` line starts a new
        record; the text collected before ``.A`` becomes the title, the text
        between ``.A`` and ``.B`` the author, the text between ``.B`` and
        ``.W`` is discarded, and everything after ``.W`` up to the next
        ``.I`` (or end of file) is the body. One ``Document(docid, title,
        author, body)`` is appended per record.

        Args:
            filename: Path of the collection file to read.
        """
        self.docs = []

        docid = ""
        title = ""
        author = ""
        # Bound up front so an empty file, or one that does not start with
        # a ".I" line, cannot hit an unbound name.
        buf = ""

        # The context manager closes the file even if parsing raises.
        with open(filename) as cf:
            for line in cf:
                # Markers are only recognised at the start of a line, so
                # ordinary text such as "J.A. Smith" is not mistaken for one.
                if line.startswith(".I"):
                    if docid != "":
                        self.docs.append(Document(docid, title, author, buf))
                    # start a new document; do not carry fields over from
                    # the previous record
                    docid = line.strip().split()[1]
                    title = ""
                    author = ""
                    buf = ""
                elif line.startswith(".T"):
                    pass  # title text follows; buf was reset at ".I"
                elif line.startswith(".A"):
                    title = buf  # got title
                    buf = ""
                elif line.startswith(".B"):
                    author = buf  # got author
                    buf = ""
                elif line.startswith(".W"):
                    buf = ""  # skip affiliation
                else:
                    buf += line

        # Flush the last record, if the file contained any.
        if docid != "":
            self.docs.append(Document(docid, title, author, buf))
Example #4
0
	def __read_yaml(self):
		"""Read ``self.yaml``, expand ``#include`` lines, and save the result.

		The whole file is read as text. Every line of the form
		``#include <token>`` is replaced by whatever ``self.__yaml_include``
		returns for the match. The expanded text is parsed with
		``yaml.load``, wrapped in ``Document(self, data)`` and saved via
		``doc.save()``.
		"""
		with open(self.yaml) as f:
			contents = f.read()

		# Raw string: the pattern is unchanged, but '\#', '\s' and '\S' are
		# no longer invalid string escapes (a warning on recent Pythons).
		pattern = re.compile(r'^\#include[\s]+([\S]+)[\s]*$', re.MULTILINE)
		contents = pattern.sub(self.__yaml_include, contents)

		# NOTE(review): yaml.load() without an explicit Loader may construct
		# arbitrary Python objects from tagged input on older PyYAML, and
		# newer PyYAML requires the Loader argument. Consider
		# yaml.safe_load() if these files never rely on Python-specific tags.
		doc = Document(self, yaml.load(contents))
		doc.save()
Example #5
0
 def __contains__( self, docid ):
     """Return whether the document identified by ``docid`` is present in
     the database.

     A ``Document`` is built for ``docid`` and its ``head()`` method is
     called; any ordinary exception raised along the way is treated as
     "not present".
     """
     try :
         doc = Document( self, docid )
         doc.head()
     except Exception :
         # Not a bare ``except``: KeyboardInterrupt and SystemExit must
         # propagate instead of being reported as a missing document.
         return False
     return True
Example #6
0
 def readfiles(self, dirname, filename, subdir):
     """Read one message file and append it to ``self.docs``.

     The title is taken from the last line containing ``Subject:`` that
     appears before the first ``Lines:`` line. On that ``Lines:`` line the
     number after the first six characters is parsed, and the body
     becomes the last that many remaining lines of the file, joined with
     their newlines removed. If the number cannot be parsed the count stays
     0, in which case all remaining lines are used.

     Args:
         dirname: Directory containing the file.
         filename: Name of the file inside ``dirname``.
         subdir: Suffix combined with ``filename`` to form the document id.
     """
     import os  # local import: keeps the block self-contained

     # os.path.join, not a hard-coded "\\", so this also works off Windows.
     filepath = os.path.join(dirname, filename)
     docid = filename + "_" + subdir
     number_of_lines = 0
     title = ''
     body = ''

     # The context manager closes the file even if reading raises.
     with open(filepath) as cf:
         for line in cf:
             if 'Subject:' in line:
                 title = line[9:].strip()  # got title
             elif 'Lines:' in line:
                 try:
                     number_of_lines = int(line[6:])
                 except Exception as e:
                     # NOTE(review): fallback kept from the original; it
                     # only triggers when the error text contains 'dog'.
                     # Confirm whether it is still wanted.
                     if 'dog' in str(e):
                         number_of_lines = 24
                 # readlines() consumes the rest of the file, so there is
                 # nothing left to iterate over afterwards.
                 remaining = cf.readlines()[-number_of_lines:]
                 body = ''.join(i.replace('\n', '') for i in remaining)
                 break

     # convert file to document format
     self.docs.append(Document(docid, title, body))
Example #7
0
def read_id_file_into_docs(file):
    """Build a list of ``Document`` objects from a tab-separated file.

    Each line is split on tab characters. A line with exactly two fields
    yields ``Document(first, "", second, "")``; any other line prints
    ``NoNoNo`` and is skipped. A click progress bar is shown while the
    lines are processed.

    Args:
        file: Path of the UTF-8 text file to read.

    Returns:
        The documents, in file order.
    """
    collected = []

    with open(file, mode="r", encoding="utf-8") as handle:
        rows = handle.readlines()
        with click.progressbar(rows, label="Reading file") as progress:
            for row in progress:
                fields = row.split("\t")
                if len(fields) != 2:
                    print("NoNoNo")
                    continue
                collected.append(Document(fields[0], "", fields[1], ""))

    return collected
Example #8
0
def read_pdf(path: str) -> List[Document]:
    """Return one ``Document`` per page of the PDF at ``path``.

    Each document carries the file's base name (via ``ntpath.basename``),
    the 1-based page number, and the text returned by the page's
    ``extractText()``.
    """
    with open(path, 'rb') as pdf_file:
        # The list is built while the file is still open.
        return [
            Document(name=ntpath.basename(path),
                     page=number,
                     content=pdf_page.extractText())
            for number, pdf_page in enumerate(PdfFileReader(pdf_file).pages,
                                              start=1)
        ]
Example #9
0
def main(file):
    """Render the file at ``file`` into a ``Document`` and save it.

    The file content is split at the first line made up solely of twenty
    or more dashes. The part before it is passed to ``doc.init`` and the
    part after it to ``doc.render``. Without such a line the intro is
    empty and the whole content is rendered. A ``PressError`` raised while
    rendering is reported through its ``report()`` method, and the
    document is saved either way.

    Args:
        file: Path of the source file.
    """
    doc = Document(file)
    # The context manager closes the file instead of leaking the handle.
    with open(file) as handle:
        source = handle.read()

    border = re.compile('^-{20,}$', re.MULTILINE)
    if border.search(source):
        # The pattern has no groups, so maxsplit=1 yields exactly two parts.
        intro, text = border.split(source, maxsplit=1)
    else:
        intro, text = "", source

    doc.init(intro)
    try:
        doc.render(text)
    except PressError as e:
        e.report()
    doc.save()
Example #10
0
	def addall(self, dirpath):
		"""Add a ``Document`` for every file under ``dirpath``, recursively.

		Subdirectories are descended into as they are met while scanning
		the directory listing; the regular files of ``dirpath`` itself are
		added afterwards, in listing order.
		"""
		pending = []

		for entry in os.listdir(dirpath):
			entry_path = os.path.join(dirpath, entry)
			if os.path.isfile(entry_path):
				# Deferred until every subdirectory has been handled.
				pending.append(entry_path)
			elif os.path.isdir(entry_path):
				self.addall(entry_path)

		for file_path in pending:
			self.add(Document(file_path))
Example #11
0
    def build(self, dirpath):
        """Build the vocabulary and the index table for ``dirpath``.

        ``self.collect_files(dirpath, names)`` is expected to fill the list
        of file names (its implementation is not shown here). Each named
        file is loaded as a ``Document``, tokenized with the instance's
        ``onlyalpha`` / ``stopwords`` / ``stemmer`` settings, and its tokens
        are added to ``self.vocab``. An ``IndexTable`` is then created,
        populated from ``dirpath``, and its tf-idf values are computed.
        """
        # Result unused; the call is kept so that an unlistable ``dirpath``
        # still fails at this point.
        os.listdir(dirpath)

        file_names = []
        self.collect_files(dirpath, file_names)

        for name in file_names:
            document = Document(os.path.join(dirpath, name))
            self.vocab.addall(
                tools.tokenize(document.content, self.onlyalpha,
                               self.stopwords, self.stemmer))

        self.itable = itable.IndexTable(
            self.vocab, self.onlyalpha, self.stopwords, self.stemmer)
        self.itable.addall(dirpath)
        self.itable.compute_tfidf()
Example #12
0
def correct_task(version=1):
    """Grammar-check text supplied by the user and return a JSON response.

    The text comes from an uploaded file (``request.files["file"]``) when
    one is present, otherwise from ``text_from_request(request)``. Only
    version 1 is accepted. Every failure path returns
    ``better_jsonify(valid=False, reason=...)``; success returns the
    checked paragraphs, the stats and the original text.
    """
    if not (1 <= version <= 1):
        return better_jsonify(valid=False, reason="Unsupported version")

    upload = request.files.get("file")

    if upload is None:
        # No upload: the text arrives as POSTed form data or a plain string
        try:
            text = text_from_request(request)
        except Exception as exc:
            logging.warning("Exception in correct_task(): {0}".format(exc))
            return better_jsonify(valid=False, reason="Invalid request")
    else:
        # An uploaded file: accept only the supported MIME types
        content_type = upload.mimetype
        if content_type not in SUPPORTED_DOC_MIMETYPES:
            return better_jsonify(valid=False, reason="File type not supported")

        try:
            # Pick the document class matching the MIME type, then
            # pull the text out of the uploaded bytes
            doc_class = Document.for_mimetype(content_type)
            text = doc_class(upload.read()).extract_text()
        except Exception as exc:
            logging.warning("Exception in correct_task(): {0}".format(exc))
            return better_jsonify(valid=False, reason="Error reading file")

    paragraphs, stats = check_grammar(
        text, progress_func=cast(Any, request).progress_func
    )

    # Hand the annotated paragraphs and statistics back to the client
    return better_jsonify(valid=True, result=paragraphs, stats=stats, text=text)
Example #13
0
    def __init__(self, files):
        """Load newsgroup posts and group their ids into six classes.

        For every path in ``files`` the directory part is searched for a
        known newsgroup name. The document id is the file name followed by
        that newsgroup name (the last match wins if several occur). The
        subject is sliced from lines containing ``Subject:`` and the
        message is every line from the first ``Lines:`` line onwards, with
        ``Lines:`` lines themselves blanked out.

        Each document id is then appended to ``self.class1items<N>`` and to
        ``self.classDocumentLookup['<N>']`` for the class ``N`` (1-6) of
        every newsgroup name found in the id.

        Args:
            files: Iterable of paths to newsgroup post files.
        """
        # (newsgroup name, class id), in the order the checks are applied.
        newsgroup_classes = (
            ("comp.graphics", '1'),
            ("comp.os.ms-windows.misc", '1'),
            ("comp.sys.ibm.pc.hardware", '1'),
            ("comp.sys.mac.hardware", '1'),
            ("comp.windows.x", '1'),
            ("rec.autos", '2'),
            ("rec.motorcycles", '2'),
            ("rec.sport.baseball", '2'),
            ("rec.sport.hockey", '2'),
            ("sci.crypt", '3'),
            ("sci.electronics", '3'),
            ("sci.med", '3'),
            ("sci.space", '3'),
            ("misc.forsale", '4'),
            ("talk.politics.misc", '5'),
            ("talk.politics.guns", '5'),
            ("talk.politics.mideast", '5'),
            ("talk.religion.misc", '6'),
            ("alt.atheism", '6'),
            ("soc.religion.christian", '6'),
        )

        self.docs = []
        self.class1items1 = []
        self.class1items2 = []
        self.class1items3 = []
        self.class1items4 = []
        self.class1items5 = []
        self.class1items6 = []
        # find class id based on document number
        self.classDocumentLookup = {}

        items_by_class = {
            '1': self.class1items1,
            '2': self.class1items2,
            '3': self.class1items3,
            '4': self.class1items4,
            '5': self.class1items5,
            '6': self.class1items6,
        }

        for filename in files:
            head, tail = os.path.split(filename)

            # Default to the bare file name so a path outside every known
            # newsgroup neither raises NameError nor silently reuses the
            # previous file's id; such a document ends up in no class.
            docid = tail
            for group, _ in newsgroup_classes:
                if group in head:
                    docid = tail + group

            subject = ''
            startread = False
            body_lines = []
            # The context manager closes each file once it has been read.
            with open(filename) as newsgroup_file:
                for line in newsgroup_file:
                    if 'Subject:' in line:
                        subject = line[9:]  # got title
                    elif 'Lines:' in line:
                        startread = True
                        line = ''  # the 'Lines:' header itself is dropped
                    if startread:
                        body_lines.append(line)

            self.docs.append(Document(docid, subject, ''.join(body_lines)))

        for doc in self.docs:
            for group, class_id in newsgroup_classes:
                if group in doc.docID:
                    items_by_class[class_id].append(doc.docID)
                    self.classDocumentLookup.setdefault(
                        class_id, []).append(doc.docID)
Example #14
0
 def __delitem__( self, docid ):
     """Delete the document identified by ``docid`` from the database.

     The revision passed to ``delete`` is the ``Etag`` header returned by
     the document's ``head()`` call, with its first and last characters
     removed.
     """
     target = Document( self, docid )
     # Drop the quote characters surrounding the Etag value
     revision = target.head()['Etag'][1:-1]
     target.delete( rev=revision )