def read_docs(self, idref=None):
        """Yield the documents of the collection, parsing them from disk.

        When ``self.__docs`` is empty (or None) the data files are parsed.
        Each file name is built as ``self.local_file_doc % key`` where ``key``
        is ``'%04.3f' % (file_id / 1000)`` for a ``file_id`` taken from
        ``self._id_files``. A file is split on a line of asterisks and every
        chunk is matched against ``self._pattern``, which must expose the
        named groups ``id`` and ``text``. Chunks whose ``id`` group is empty
        are skipped.

        When ``self.__docs`` already holds documents, its values are yielded
        instead and nothing is read from disk. This method itself never
        stores the parsed documents in ``self.__docs``.

        Args:
            idref: optional reference id. When truthy, only one file is read:
                the one for the last entry of ``self._id_files`` that is
                less than or equal to ``idref``. When falsy, every file
                listed in ``self._id_files`` is read.

        Yields:
            Document: one instance per matching chunk, with an integer id.

        Raises:
            IndexError: if ``idref`` is truthy and no entry of
                ``self._id_files`` is less than or equal to it.
        """
        if not self.__docs:
            self.__docs = dict()
            if idref:
                # Only the last file id not exceeding idref is relevant.
                candidates = [
                    '%04.3f' % (file_id / 1000)
                    for file_id in self._id_files
                    if idref >= file_id
                ]
                file_keys = [candidates[-1]]
            else:
                file_keys = [
                    '%04.3f' % (file_id / 1000) for file_id in self._id_files
                ]
            for file_key in file_keys:
                # Read the whole file up front so the handle is closed before
                # control is handed back to the consumer of the generator.
                with open(self.local_file_doc % file_key, 'r') as handle:
                    chunks = handle.read().split(
                        '********************************************')
                for chunk in chunks:
                    match = re.match(self._pattern, chunk, re.DOTALL)
                    if not match:
                        continue
                    fields = match.groupdict()
                    if fields['id']:
                        yield Document(id=int(fields['id']),
                                       text=fields['text'])
        else:
            for d in self.__docs.values():
                yield d
 def read_doc(self, id):
     """Return the document whose id matches ``id``, or None if not found.

     The file at ``self.local_file_doc`` is read in full and split on the
     ``.I`` marker. Each chunk is matched against ``self._pattern``, which
     must expose the named groups ``id`` and ``text``. The first chunk whose
     ``id`` group equals ``str(id)`` is returned as a ``Document``; its id is
     passed through as the matched string.

     Args:
         id: identifier to look for; compared through ``str(id)``.

     Returns:
         Document or None: the first matching document, if any.
     """
     # The context manager closes the file even if reading raises.
     with open(self.local_file_doc, 'r') as handle:
         chunks = handle.read().split('.I')
     wanted = str(id)
     for chunk in chunks:
         match = re.match(self._pattern, chunk, re.DOTALL)
         if match:
             fields = match.groupdict()
             if fields['id'] == wanted:
                 return Document(id=fields['id'], text=fields['text'])
     return None
 def read_doc(self, id):
     """Return the document identified by ``id``.

     When ``self.__docs`` holds documents, the lookup is served from it with
     ``int(id)`` as key. Otherwise the file at ``self.local_file_doc`` is
     read in full, split on ``/`` and each chunk is matched against
     ``self._pattern`` (named groups ``id`` and ``text``); the first chunk
     whose id equals ``id`` numerically is returned.

     Args:
         id: identifier to look for; converted with ``int``.

     Returns:
         Document or None: the matching document. None is returned only on
         the file-parsing path when no chunk matches.

     Raises:
         KeyError: on the cached path when ``int(id)`` is not a key of
             ``self.__docs``.
     """
     if self.__docs:
         return self.__docs[int(id)]

     # The context manager closes the file even if reading raises.
     with open(self.local_file_doc, 'r') as handle:
         chunks = handle.read().split('/')
     for chunk in chunks:
         match = re.match(self._pattern, chunk, re.DOTALL)
         if match:
             fields = match.groupdict()
             if int(fields['id']) == int(id):
                 return Document(id=fields['id'], text=fields['text'])
     return None
Exemple #4
0
 def read_doc(self, id):
     """Return the document whose id equals ``id``, or None if not found.

     The index file at ``self.local_file_doc`` is read as UTF-8 and split on
     the ``.I`` marker. Each chunk is matched against ``self._pattern``,
     which must expose the named groups ``id`` and ``text``. For the first
     chunk whose integer id equals ``id``, the stripped ``text`` group is
     used as the document name and the content is loaded from
     ``self.path + "/" + name + ".txt"``.

     Args:
         id: identifier to look for; compared with ``int`` of the id group.

     Returns:
         Document or None: the matching document (with ``id``, ``text`` and
         ``name`` set), or None when no chunk matches.
     """
     # The context manager closes the index file even if reading raises.
     with open(self.local_file_doc, 'r', encoding="utf-8") as index_file:
         chunks = index_file.read().split('.I')
     for chunk in chunks:
         match = re.match(self._pattern, chunk, re.DOTALL)
         if match:
             fields = match.groupdict()
             if int(fields['id']) == id:
                 dc = fields['text'].strip()
                 # Decode the document with the same encoding as the index
                 # instead of relying on the platform default.
                 with open(self.path + "/" + dc + ".txt",
                           encoding="utf-8") as f:
                     return Document(id=int(fields['id']),
                                     text=f.read(),
                                     name=dc)
     return None
 def read_docs(self):
     """Yield the documents of the collection.

     When ``self.__docs`` is empty (or None), the file at
     ``self.local_file_doc`` is read in full, split on the ``.I`` marker and
     every chunk matching ``self._pattern`` (named groups ``id`` and
     ``text``) is yielded as a ``Document``. The id is passed through as the
     matched string. This method does not store the parsed documents in
     ``self.__docs``.

     When ``self.__docs`` already holds documents, its values are yielded
     and nothing is read from disk.

     Yields:
         Document: one instance per matching chunk or cached entry.
     """
     if not self.__docs:
         self.__docs = dict()
         # Read everything first so the handle is closed before the first
         # document is handed to the consumer of the generator.
         with open(self.local_file_doc, 'r') as handle:
             chunks = handle.read().split('.I')
         for chunk in chunks:
             match = re.match(self._pattern, chunk, re.DOTALL)
             if match:
                 fields = match.groupdict()
                 yield Document(id=fields['id'], text=fields['text'])
     else:
         for d in self.__docs.values():
             yield d
Exemple #6
0
 def read_docs(self):
     """Yield the documents of the collection, caching them as they are read.

     When ``self.__docs`` is empty (or None), the file at
     ``self.local_file_doc`` is read in full and split wherever a newline is
     followed by ``.I``. Every chunk matching ``self._pattern`` (named
     groups ``id``, ``title`` and ``text``) becomes a ``Document`` whose text
     is the title followed by the text; it is stored in ``self.__docs`` under
     ``d.id`` and then yielded. Non-blank chunks that do not match are
     reported on standard output.

     When ``self.__docs`` already holds documents, its values are yielded
     and nothing is read from disk.

     NOTE(review): the cache is filled while iterating, so a consumer that
     stops early leaves a partially filled cache behind.

     Yields:
         Document: one instance per matching chunk or cached entry.
     """
     if not self.__docs:
         self.__docs = dict()
         # Read everything first so the handle is closed before the first
         # document is handed to the consumer of the generator.
         with open(self.local_file_doc, 'r') as handle:
             content = handle.read()
         # Raw string: '\.' is not a valid escape in a normal string literal.
         for chunk in re.split(r'\n\.I', content):
             match = re.match(self._pattern, chunk, re.DOTALL)
             if match:
                 fields = match.groupdict()
                 d = Document(id=fields['id'],
                              text=fields['title'] + fields['text'])
                 self.__docs[d.id] = d
                 yield d
             elif chunk.strip():
                 print("Not match " + chunk)
     else:
         for d in self.__docs.values():
             yield d
Exemple #7
0
 def read_docs(self):
     """Yield every document listed in the index file, rebuilding the cache.

     ``self.__docs`` is reset to an empty dict on every call. The index file
     at ``self.local_file_doc`` is read as UTF-8 and split on the ``.I``
     marker. For each non-empty chunk matching ``self._pattern`` (named
     groups ``id`` and ``text``), the stripped ``text`` group is used as the
     document name and the content is loaded as UTF-8 from
     ``self.path + "/" + name + ".txt"``. Each document is stored in
     ``self.__docs`` under its id and then yielded.

     Yields:
         Document: one instance per matching chunk, with integer ``id``,
         file content as ``text`` and the document name as ``name``.
     """
     self.__docs = dict()
     # Read the index first so its handle is closed before the first
     # document is handed to the consumer of the generator.
     with open(self.local_file_doc, 'r', encoding="utf-8") as index_file:
         chunks = index_file.read().split('.I')
     for chunk in chunks:
         if not chunk:
             continue
         match = re.match(self._pattern, chunk, re.DOTALL)
         if match:
             fields = match.groupdict()
             dc = fields['text'].strip()
             with open(self.path + "/" + dc + ".txt",
                       encoding="utf-8") as f:
                 d = Document(id=int(fields['id']),
                              text=f.read(),
                              name=dc)
             self.__docs[d.id] = d
             yield d
Exemple #8
0
 def read_docs(self):
     """Return a list with one Document per entry of ``self.local_file_doc``.

     Each document's id is the entry's position in the iteration order and
     its text is the entry itself.
     """
     documents = []
     for position, content in enumerate(self.local_file_doc):
         documents.append(Document(id=position, text=content))
     return documents
Exemple #9
0
 def read_doc(self, id):
     """Return a Document for the entry stored at ``self.local_file_doc[id]``.

     ``id`` is used both as the lookup key and as the document id.
     """
     document = Document(
         id=id,
         text=self.local_file_doc[id],
     )
     return document