def extract_head( post: Publication) -> Tuple[Publication, Optional[Publication]]:
    """Split *post* into a head publication plus an optional tail.

    Returns a ``(head, tail)`` pair; ``tail`` is ``None`` when the whole
    post fits into the head.

    NOTE(review): when ``post`` has no attachments this function falls off
    the end and implicitly returns ``None`` instead of a tuple — confirm
    that callers only invoke it for posts with attachments.
    """
    if post.attachments:
        # Text longer than 1024 characters: cut it at the limit and push
        # every attachment into the tail publication.
        if len(post.plain_text) > 1024:
            head, tail = smart_cut(post.plain_text, 1024)
            head = Publication(head)
            tail = Publication(tail, post.attachments)
            return head, tail
        for index, attach in enumerate(post.attachments):
            if type(attach) is Poll:
                if post.plain_text:
                    # A poll travels in its own publication; the text (and
                    # anything else) stays behind in the original post.
                    tail = Publication(attachments=[attach])
                    post.attachments.pop(index)
                    if not post.plain_text and not post.attachments:
                        # NOTE(review): ``post.plain_text`` is truthy on
                        # this path, so this can never trigger — looks
                        # copy-pasted from the sibling extract_head; verify.
                        post = None
                    return post, tail
                else:
                    # No text at all: send the post as-is, nothing split off.
                    return post, None
        # No poll found: group the remaining attachments by file type.
        attachments = {}
        for i in post.attachments:
            attachments.setdefault(i.type, []).append(i)
        # Pictures and videos can apparently be sent together, so videos
        # are merged into the picture group — TODO confirm against the
        # target platform's attachment rules.
        if FileType.PICTURE in attachments:
            attachments[FileType.PICTURE].extend(
                attachments.get(FileType.VIDEO, []))
        attachments = list(attachments.values())
        # First type-group accompanies the text; the next group (if any)
        # becomes the tail. Any further groups are dropped here —
        # NOTE(review): confirm that is intentional.
        head = Publication(post.plain_text, attachments.pop(0))
        if attachments:
            tail = Publication(attachments=attachments.pop(0))
        else:
            tail = None
        # Lord forgive me for this mess above.
        return head, tail
def read(self, path):
    """Read every ``.txt``/``.ann`` document pair found in *path*.

    Rebuilds each document's raw text from its ``.txt`` file, parses the
    matching brat-style ``.ann`` file into :class:`Keyphrase` objects and
    stores one :class:`Publication` per pair in ``self._publications``.

    :param path: directory containing the ``.txt`` and ``.ann`` files
    """
    txt_files = [
        f for f in listdir(path)
        if isfile(join(path, f)) and f.endswith(".txt")
    ]
    self._publications = []
    for f in txt_files:
        filename = f[:-4]  # strip the ".txt" extension
        # csv.reader splits each physical line on commas; the cells are
        # re-joined with commas and NBSPs normalised to plain spaces.
        # Rows are concatenated without a separator, exactly as before.
        # Use join() instead of path + '\\' so this also works off Windows.
        parts = []
        with open(join(path, f), encoding='utf-8') as inputfile:
            for row in csv.reader(inputfile):
                parts.append(
                    ','.join(cell.replace('\xa0', ' ') for cell in row))
        text = ''.join(parts)
        keyphrases = []
        with open(join(path, filename + '.ann'),
                  encoding='utf-8') as annfile:
            for row in csv.reader(annfile):
                # NOTE(review): only row[0] is used, so an annotation line
                # containing a comma is truncated at that comma — confirm
                # the .ann surface forms never contain commas.
                temp = row[0].split('\t')
                temp_label = temp[1].split(' ')
                if len(temp) == 3:
                    keyphrases.append(
                        Keyphrase(temp[0], temp_label[0], temp_label[1],
                                  temp_label[2], temp[2]))
                elif len(temp) == 2:
                    # Annotation without a surface form: use '.' as a dummy.
                    keyphrases.append(
                        Keyphrase(temp[0], temp_label[0], temp_label[1],
                                  temp_label[2], '.'))
        self._publications.append(Publication(filename, text, keyphrases))
def compose_publication(zipped_publication):
    """Assemble a Publication from a zipped token/label sequence.

    The sequence is first grouped by label via ``group_publication``;
    the grouped fields are then mapped onto the Publication parts.
    """
    fields = group_publication(zipped_publication)
    field = fields.get  # shorthand: missing labels yield None
    return Publication(
        field("T_TITLE"),
        Author.parse_authors(field("T_AUTHOR")),
        Source(field("T_JOURNAL")),
        Misc(field("T_LOCATION"), field("T_PUBLISHER"), field("T_YEAR"),
             field("T_VOLUME"), field("T_PAGES")),
    )
def load_data(self):
    """Crawl the conference's venue and publication URLs and cache them."""
    venues = self.crawl_more_venue_urls()
    pub_urls = self.crawl_publications(venues)
    self.data = {
        "venue urls": venues,
        "publication urls": pub_urls,
        "publications": [Publication(url) for url in pub_urls],
    }
def extract_head( post: Publication) -> Tuple[Publication, Optional[Publication]]:
    """Split *post* into a head that fits the platform limits and a tail.

    Three cases, checked in order:
    * text over 500 chars -> cut the text, all attachments go to the tail;
    * more than 4 attachments -> first four stay with the text, rest tail;
    * a poll among the attachments -> the poll becomes the head on its own.

    Returns ``(head, tail)``; ``tail`` is ``None`` when nothing is left over.
    """
    arrow = ' ->'
    if len(post.plain_text) > 500:
        # Reserve room for the continuation marker appended to the head.
        first_part, rest = smart_cut(post.plain_text, 500 - len(arrow))
        return (Publication(first_part + arrow),
                Publication(rest, post.attachments))
    if len(post.attachments) > 4:
        # At most four attachments travel with the text.
        return (Publication(post.plain_text, post.attachments[:4]),
                Publication(attachments=post.attachments[4:]))
    for position, item in enumerate(post.attachments):
        if type(item) is Poll:
            poll_pub = Publication(attachments=[item])
            post.attachments.pop(position)
            remainder = post
            if not post.plain_text and not post.attachments:
                # The poll was the whole post; nothing remains.
                remainder = None
            return poll_pub, remainder
    return post, None
def extract(self, page):
    """Extract all the publications info in a given result page.

    :param page: result page as an BeautifulSoup4 object
    :returns: list of Publication
    """
    # The third table on the page holds the results; its first row is
    # the header, so only the rows after it are parsed.
    result_table = page.find_all('table')[2]
    data_rows = result_table.find_all('tr')[1:]
    return [
        Publication(self.extract_attributes(row.find_all('td')))
        for row in data_rows
    ]
def new(d):
    """Create and persist a Publication from the request dict *d*.

    Counters start at zero; the owning user is referenced by key.
    Returns the stored entity as a plain dict via ``dictify_pub``.
    """
    pub = Publication()
    pub.abstract = d['abstract']
    pub.authors = [name.strip() for name in d['authors']]
    pub.gcs_file_path = d['file_path']
    pub.summary = d['summary']
    pub.title = d['title']
    # Fresh publication: all engagement counters begin at zero.
    for counter in ('num_comments', 'num_views', 'num_votes'):
        setattr(pub, counter, 0)
    # TODO: tags are not persisted yet (was: p.tags = d['tags'] or []).
    pub.user = ndb.Key(User, d['user'])
    pub.put()
    return dictify_pub(pub)
def parse_post(self, raw: Dict[str, Any]) -> Publication:
    """Convert a raw VK wall-post dict into a Publication.

    Attachments are parsed one by one; an attachment raising
    ``NeedExpand`` contributes an extra text line instead, and an
    ``UnsupportedAttachment`` is reported inline with a link back to the
    original post. A lone poll whose title merely duplicates the post
    text causes the duplicate text to be dropped.

    :param raw: post dict as returned by the VK API (needs at least
        ``text``, ``id`` and ``owner_id``)
    """
    text = raw['text']
    attachments = []
    for a in raw.get('attachments', []):
        try:
            attachments.append(self.parse_attachment(a))
        except NeedExpand as ex:
            text = add_line(text, ex.adding_line)
        except UnsupportedAttachment as ex:
            # Renamed from ``id`` to avoid shadowing the builtin.
            post_id = raw['id']
            owner_id = raw['owner_id']
            original = f"https://vk.com/wall{owner_id}_{post_id}"
            text = add_line(text,
                            f"Unsupported attachment type '{ex.type}'."
                            " You may want look to original: "
                            f"{original}")
    if len(attachments) == 1 and type(attachments[0]) is Poll:
        if text == attachments[0].title:
            # The poll already carries its title; drop the duplicate text.
            text = ''
    return Publication(text, attachments)
def load_data(self):
    """Fetch and parse this author's DBLP person record."""
    url = params.DBLP_PERSON_URL.format(urlpt=self.urlpt)
    # Keep the raw XML around for debugging/inspection.
    self.xml = requests.get(url).content
    root = etree.fromstring(self.xml)
    # Untyped dblpkey entries are the author's publication keys.
    publication_keys = root.xpath('/dblpperson/dblpkey[not(@type)]/text()')
    self.data = {
        'name': root.attrib['name'],
        'publications': [Publication(key) for key in publication_keys],
        'homepages':
            root.xpath('/dblpperson/dblpkey[@type="person record"]/text()'),
        'homonyms': root.xpath('/dblpperson/homonym/text()'),
    }
def get(self, after: float):
    """Return a single stub Publication stamped with the *after* time.

    :param after: POSIX timestamp (seconds) to report publications after
    """
    moment = datetime.fromtimestamp(after)
    print(f"getting after {moment} ({after})")
    return [Publication(moment.isoformat())]