def extract_head(post: Publication) -> Tuple[Publication, Optional[Publication]]:
    """Split *post* into a head publication and an optional tail.

    Over-long text is cut at 1024 characters; a poll attachment is split
    off into the tail; otherwise the attachments are grouped by file
    type, the first group going to the head and the next to the tail.

    :param post: publication to split.
    :returns: ``(head, tail)`` where ``tail`` is ``None`` when nothing
        is left over.
    """
    if not post.attachments:
        # Nothing to split off.  The original fell through here and
        # implicitly returned None, which broke the declared return type
        # for callers unpacking the tuple.
        return post, None

    if len(post.plain_text) > 1024:
        head_text, tail_text = smart_cut(post.plain_text, 1024)
        return Publication(head_text), Publication(tail_text, post.attachments)

    for index, attach in enumerate(post.attachments):
        if type(attach) is Poll:
            if not post.plain_text:
                return post, None
            # Split the poll into its own (tail) publication.
            tail = Publication(attachments=[attach])
            post.attachments.pop(index)
            # NOTE: a former `if not post.plain_text and not
            # post.attachments: post = None` here was unreachable
            # (plain_text is known truthy in this branch) — removed.
            return post, tail

    # Group remaining attachments by file type; pictures and videos may
    # share one publication.
    by_type = {}
    for item in post.attachments:
        by_type.setdefault(item.type, []).append(item)
    if FileType.PICTURE in by_type:
        # NOTE(review): videos are merged into the picture group, but
        # the VIDEO group is not removed from the dict, so videos may
        # appear in two groups — confirm whether that is intended.
        by_type[FileType.PICTURE].extend(by_type.get(FileType.VIDEO, []))
    groups = list(by_type.values())
    head = Publication(post.plain_text, groups.pop(0))
    tail = Publication(attachments=groups.pop(0)) if groups else None
    return head, tail
def __load_txt(self):
    """Parse ``self.filename`` line by line into a list of citation dicts.

    Each line is matched against a Russian or an English citation
    pattern (chosen by sniffing the first 15 characters) and decomposed
    into authors / article / source / misc groups.

    NOTE(review): the patterns use ``\pL`` / ``\p{Lu}`` property
    classes, which the stdlib ``re`` module does not support — this only
    works if ``re`` is actually the third-party ``regex`` module;
    confirm the import at the top of the file.

    :returns: list of dicts with keys "authors", "article", "source",
        "misc".
    """
    rn1 = r"(?P<authors>((\pL\. ?(\pL\. )?\pL+,? )|(\pL+ \pL\. ?(\pL\.)?,? )"  # author forms
    rn2 = r"|(\p{Lu}\p{Ll}+ \p{Lu}\p{Ll}+,? )"
    rn3 = r")+)"
    ra_ru = r"(?P<article>\p{Lu}\p{Ll}+ \p{Ll}+.*?) *\/\/ *"  # article title (Russian)
    ra_eng = r"(?P<article>\p{Lu}.*?) *\/\/ *"  # article title (English)
    rj = r'(?P<source>[ \pL"“”]+)'  # source (journal) name
    rm = r"(?P<misc>.+)"  # trailing misc data
    reg_ru = re.compile(rn1 + rn2 + rn3 + ra_ru + rj + rm, re.UNICODE)
    reg_eng = re.compile(rn1 + rn3 + ra_eng + rj + rm, re.UNICODE)

    data = []
    # `with` guarantees the handle is closed (the original leaked it).
    with open(self.filename, 'r') as f:
        items = f.read().split('\n')
    for item in items:
        if isEnglish(item[:15]):
            res = reg_eng.match(item.strip())
        else:
            res = reg_ru.match(item.strip())
        if res is not None:
            # NOTE(review): this Publication is built but never stored;
            # kept in case Author.parseAuthors has side effects.
            publication = Publication()
            publication.authors = Author.parseAuthors(res.group("authors"))
            data.append({"authors": split_authors(res.group("authors")),
                         "article": res.group("article"),
                         "source": res.group("source"),
                         "misc": res.group("misc")})
        else:
            print("Wrong line: " + item)
    return data
class DBLPHandler(xml.sax.ContentHandler):
    """SAX handler that streams DBLP records into the database.

    Publications are flushed via ``DB.dumps`` as soon as their closing
    tag is seen, so the (huge) DBLP dump is never held in memory whole.
    """

    def __init__(self):
        xml.sax.ContentHandler.__init__(self)
        # Element names that open a new publication record.
        self.PUB_TYPES = ["article", "inproceedings", "proceedings", "book",
                          "incollection", "phdthesis", "mastersthesis", "www"]
        # Child elements captured as fields of the current publication.
        self.FIELDS = ["author", "editor", "title", "booktitle", "pages",
                       "year", "address", "journal", "volume", "number",
                       "month", "url", "ee", "cdrom", "cite", "publisher",
                       "note", "crossref", "isbn", "series", "school",
                       "chapter"]
        self.publication = None  # record currently being built
        self.content = ''        # text accumulated for the current field
        self.db = DB()
        self.count = 0           # number of records seen (progress counter)

    def startElement(self, name, attrs):
        if name in self.PUB_TYPES:
            # print(...) instead of `print ...` so the module also
            # parses under Python 3 (single-argument form is identical
            # in Python 2).
            print(self.count)
            self.count += 1
            key = attrs.getValue("key")
            self.publication = Publication(name, key)
            self.content = ''
        if name in self.FIELDS:
            self.content = ''

    def endElement(self, name):
        if name in self.PUB_TYPES:
            self.db.dumps(self.publication)
        if name in self.FIELDS:
            self.publication.add_field(name, self.content)

    def characters(self, content):
        # Escape backslashes so the value survives later serialization.
        # NOTE(review): .encode('utf-8') returns bytes under Python 3,
        # where the str-argument replace would fail — this body is
        # Python 2 code; confirm the target interpreter.
        self.content += content.encode('utf-8').replace('\\', '\\\\')
def read(self, path):
    """Load every .txt/.ann file pair under *path* into ``self._publications``.

    Each ``.txt`` file holds the document text (read via csv to
    normalize comma handling); the matching ``.ann`` file holds
    brat-style keyphrase annotations (tab-separated).

    :param path: directory containing the .txt and .ann files.
    """
    txt_files = [f for f in listdir(path)
                 if isfile(join(path, f)) and f.endswith(".txt")]
    self._publications = []
    for f in txt_files:
        filename = f[:-4]  # strip ".txt"
        text = ""
        # join() instead of the original hard-coded '\\' separator, so
        # this also works on non-Windows systems.
        with open(join(path, f), encoding='utf-8') as inputfile:
            for row in csv.reader(inputfile):
                # Re-join the cells with commas, replacing non-breaking
                # spaces.  Rows are concatenated with no separator,
                # exactly as in the original.
                text += ','.join(cell.replace(u'\xa0', ' ') for cell in row)
        keyphrases = []
        with open(join(path, filename + '.ann'), encoding='utf-8') as annfile:
            for row in csv.reader(annfile):
                temp = row[0].split('\t')
                temp_label = temp[1].split(' ')
                if len(temp) == 3:
                    keyphrases.append(
                        Keyphrase(temp[0], temp_label[0], temp_label[1],
                                  temp_label[2], temp[2]))
                elif len(temp) == 2:
                    # Annotation without a surface form: use a dummy '.'
                    keyphrases.append(
                        Keyphrase(temp[0], temp_label[0], temp_label[1],
                                  temp_label[2], '.'))
        self._publications.append(Publication(filename, text, keyphrases))
def startElement(self, name, attrs):
    """SAX callback: open a new publication record or reset field text.

    :param name: tag name of the element being opened.
    :param attrs: SAX attribute object; "key" identifies the record.
    """
    if name in self.PUB_TYPES:
        # print(...) instead of `print ...` so the module also parses
        # under Python 3 (identical output for a single argument).
        print(self.count)
        self.count += 1
        key = attrs.getValue("key")
        self.publication = Publication(name, key)
        self.content = ''
    if name in self.FIELDS:
        self.content = ''
def compose_publication(zipped_publication):
    """Build a Publication from a zipped record of tagged fields."""
    fields = group_publication(zipped_publication)
    get = fields.get
    return Publication(
        get("T_TITLE"),
        Author.parse_authors(get("T_AUTHOR")),
        Source(get("T_JOURNAL")),
        Misc(get("T_LOCATION"), get("T_PUBLISHER"), get("T_YEAR"),
             get("T_VOLUME"), get("T_PAGES")),
    )
def load_data(self):
    """load data of the conference."""
    venue_urls = self.crawl_more_venue_urls()
    publication_urls = self.crawl_publications(venue_urls)
    self.data = {
        "venue urls": venue_urls,
        "publication urls": publication_urls,
        "publications": [Publication(url) for url in publication_urls],
    }
def extract_head(post: Publication) -> Tuple[Publication, Optional[Publication]]:
    """Split *post* into a first-message head and an optional remainder."""
    text = post.plain_text
    attachments = post.attachments

    # Over-long text: cut it, marking the head with a trailing arrow.
    if len(text) > 500:
        head_text, tail_text = smart_cut(text, 500 - len(' ->'))
        return Publication(head_text + ' ->'), Publication(tail_text, attachments)

    # Too many attachments: the first four go with the text, the rest follow.
    if len(attachments) > 4:
        first = Publication(text, attachments[:4])
        rest = Publication(attachments=attachments[4:])
        return first, rest

    # A poll must be sent on its own: it becomes the head and whatever
    # remains of the post (if anything) becomes the tail.
    for position, attachment in enumerate(attachments):
        if type(attachment) is Poll:
            head = Publication(attachments=[attachment])
            attachments.pop(position)
            if not text and not attachments:
                post = None
            return head, post

    return post, None
def extract(self, page):
    """Extract all the publications info in a given result page.

    :param page: result page as an BeautifulSoup4 object
    :returns: list of Publication
    """
    # Third table on the page holds the results; first row is the header.
    data_rows = page.find_all('table')[2].find_all('tr')[1:]
    return [
        Publication(self.extract_attributes(row.find_all('td')))
        for row in data_rows
    ]
class Article(Publication):
    """
    Articles are the heart of the site!  They are published inside
    clearly separated categories.
    """
    Publication.register("Article")  # register this publication type

    # --- Fields ---
    # (verbose names and help_text stay in French: they are user-facing
    # runtime strings, not comments)
    sous_titre = models.CharField(
        'Sous titre',
        max_length=100,
        blank=True,
        null=True,
        help_text="Sous-titre d'accroche à l'article.")
    categorie = models.ForeignKey('Categorie',
                                  limit_choices_to={"profondeur__gt": 0})
    illustration = models.ImageField(
        'Image d\'illustration',
        upload_to="upload/articles/%Y/%m/%d",
        help_text=
        "Image qui apparaîtra à côté de l'article pour le représenter.",
        blank=True,
        null=True)
    mot_cles = models.CharField(
        'Mot clés associés à l\'article',
        max_length=200,
        blank=True,
        null=True,
        help_text=
        "Liste de mot clés facilitant la recherche d'articles/dossiers liés.",
        db_index=True)

    # Absolute URL of the article: "/[categorie...]/[label].html"
    def get_absolute_url(self):
        return u"%s/%s.html" % (self.categorie.get_absolute_url(), self.label)

    # Breadcrumb of the article (the chain of categories leading to it)
    def get_arbo(self):
        return " > ".join([c.nom for c in self.categorie.get_parentes()])

    # Inner Meta class holding a few model options: the title must be
    # unique within each category, and so must the label => two articles
    # with the same name are allowed in two different categories.
    class Meta:
        unique_together = (
            ("titre", "categorie"),
            ("label", "categorie")
        )
def parse_post(self, raw: Dict[str, Any]) -> Publication:
    """Convert a raw VK post dict into a Publication.

    Attachments that request expansion contribute an extra text line;
    unsupported attachment types are replaced by a link to the original
    post.  A lone poll whose title merely repeats the post text clears
    the text.

    :param raw: post payload with at least 'text', 'id', 'owner_id'.
    """
    text = raw['text']
    attachments = []
    for a in raw.get('attachments', []):
        try:
            attachments.append(self.parse_attachment(a))
        except NeedExpand as ex:
            text = add_line(text, ex.adding_line)
        except UnsupportedAttachment as ex:
            # `post_id` rather than `id`: don't shadow the builtin.
            post_id = raw['id']
            owner_id = raw['owner_id']
            original = f"https://vk.com/wall{owner_id}_{post_id}"
            text = add_line(text, f"Unsupported attachment type '{ex.type}'."
                                  " You may want look to original: "
                                  f"{original}")
    if len(attachments) == 1 and type(attachments[0]) is Poll:
        # Drop text that only duplicates the poll's title.
        if text == attachments[0].title:
            text = ''
    return Publication(text, attachments)
def load_data(self):
    """load data of author."""
    resp = requests.get(params.DBLP_PERSON_URL.format(urlpt=self.urlpt))
    raw_xml = resp.content
    self.xml = raw_xml
    root = etree.fromstring(raw_xml)
    # Keys of the author's own publications (dblpkey without a @type).
    pub_keys = root.xpath('/dblpperson/dblpkey[not(@type)]/text()')
    self.data = {
        'name': root.attrib['name'],
        'publications': [Publication(key) for key in pub_keys],
        'homepages': root.xpath(
            '/dblpperson/dblpkey[@type="person record"]/text()'),
        'homonyms': root.xpath('/dblpperson/homonym/text()'),
    }
def get(self, after: float):
    """Return a single publication stamped with *after* as ISO-8601 text."""
    moment = datetime.fromtimestamp(after)
    print(f"getting after {moment} ({after})")
    return [Publication(moment.isoformat())]
class Edito(Publication):
    """
    Editos appear on the home page of the site.
    """
    Publication.register("Edito")  # register this publication type
def new(d):
    """Create and persist a Publication built from dict *d*.

    Returns the stored entity in dictified form.
    """
    pub = Publication()
    pub.title = d['title']
    pub.abstract = d['abstract']
    pub.summary = d['summary']
    pub.authors = [author.strip() for author in d['authors']]
    pub.gcs_file_path = d['file_path']
    pub.user = ndb.Key(User, d['user'])
    # Engagement counters all start at zero.
    pub.num_comments = 0
    pub.num_views = 0
    pub.num_votes = 0
    pub.put()
    return dictify_pub(pub)
def get_pub_list(count=20):
    """Return up to *count* publications, fully dictified."""
    fetched = Publication.query_pubs().fetch(count)
    return [dictify_pub(pub, True) for pub in fetched]