def __load_txt(self):
    """Parse the text file at ``self.filename`` into publication records.

    The file is read in full and split on ``'\\n'``; each resulting line is
    stripped and matched against one of two patterns of the form
    ``<authors><article> // <source><misc>``. ``isEnglish`` (defined
    elsewhere) is given the first 15 characters of the raw line and decides
    which pattern is tried.

    Lines that do not match are reported on stdout as ``"Wrong line: ..."``
    and skipped.

    Returns:
        A list of dicts, one per matched line, with the keys ``"authors"``
        (the result of ``split_authors`` on the matched author text),
        ``"article"``, ``"source"`` and ``"misc"`` (the matched substrings).
    """
    # NOTE(review): ``\pL`` / ``\p{Lu}`` are Unicode property classes that the
    # standard-library ``re`` rejects; presumably ``re`` is bound to the
    # third-party ``regex`` package in this file -- confirm.

    # Author list: one or more of "I. (I. )Surname" or "Surname I.(I.)",
    # each optionally followed by a comma and always by a space.
    authors_head = r"(?P<authors>((\pL\. ?(\pL\. )?\pL+,? )|(\pL+ \pL\. ?(\pL\.)?,? )"
    # Extra alternative used only by the non-English pattern: two
    # capitalised words ("Name Surname").
    authors_two_words = r"|(\p{Lu}\p{Ll}+ \p{Lu}\p{Ll}+,? )"
    authors_tail = r")+)"
    # Article title, lazily matched up to the "//" separator. The non-English
    # variant requires a capitalised word followed by a lowercase word.
    article_ru = r"(?P<article>\p{Lu}\p{Ll}+ \p{Ll}+.*?) *\/\/ *"
    article_eng = r"(?P<article>\p{Lu}.*?) *\/\/ *"
    # Source: letters, spaces and straight/curly double quotes only.
    source = r'(?P<source>[ \pL"“”]+)'
    # Everything left on the line after the source.
    misc = r"(?P<misc>.+)"

    reg_ru = re.compile(
        authors_head + authors_two_words + authors_tail + article_ru + source + misc,
        re.UNICODE,
    )
    reg_eng = re.compile(
        authors_head + authors_tail + article_eng + source + misc,
        re.UNICODE,
    )

    # Context manager guarantees the handle is closed even if reading fails.
    # NOTE(review): the file is decoded with the platform default encoding;
    # consider passing an explicit ``encoding=`` once the input encoding is
    # confirmed.
    with open(self.filename, 'r') as f:
        content = f.read()

    data = []
    for item in content.split('\n'):
        # The language check sees the raw line prefix; matching uses the
        # stripped line.
        pattern = reg_eng if isEnglish(item[:15]) else reg_ru
        res = pattern.match(item.strip())
        if res is None:
            print("Wrong line: " + item)
            continue
        data.append({
            "authors": split_authors(res.group("authors")),
            "article": res.group("article"),
            "source": res.group("source"),
            "misc": res.group("misc"),
        })
    return data