Example #1
0
def extract_head(
        post: Publication) -> Tuple[Publication, Optional[Publication]]:
    """Split *post* into a head publication and an optional tail.

    Over-long text is cut at 1024 characters; a poll is detached into its
    own publication; remaining attachments are grouped by file type.

    :param post: the publication to split (may be mutated: a detached
        poll is popped from ``post.attachments``)
    :returns: ``(head, tail)`` where ``tail`` is ``None`` when no split
        is needed
    """
    if not post.attachments:
        # Nothing to split off — the original fell off the end here and
        # returned a bare None, breaking the declared return type.
        return post, None
    if len(post.plain_text) > 1024:
        head_text, tail_text = smart_cut(post.plain_text, 1024)
        # The cut text goes first; every attachment travels with the tail.
        return (Publication(head_text),
                Publication(tail_text, post.attachments))
    for index, attach in enumerate(post.attachments):
        if type(attach) is Poll:
            if post.plain_text:
                # Detach the poll into its own tail publication.
                # (The original also checked "not plain_text and not
                # attachments" here, but plain_text is truthy in this
                # branch, so that code was unreachable and is dropped.)
                tail = Publication(attachments=[attach])
                post.attachments.pop(index)
                return post, tail
            return post, None

    # Group the remaining attachments by file type.
    grouped = {}
    for item in post.attachments:
        grouped.setdefault(item.type, []).append(item)
    if FileType.PICTURE in grouped:
        # Pictures and videos can share a message: merge videos into the
        # picture group and *remove* the standalone video group — the
        # original kept both entries, so videos could be sent twice.
        grouped[FileType.PICTURE].extend(grouped.pop(FileType.VIDEO, []))
    groups = list(grouped.values())
    head = Publication(post.plain_text, groups.pop(0))
    # NOTE(review): only the first two groups are used; any further
    # groups are silently dropped — confirm this is intended.
    tail = Publication(attachments=groups.pop(0)) if groups else None
    return head, tail
 def read(self, path):
     """Load publications from BRAT-style .txt/.ann file pairs in *path*.

     Each ``.txt`` file holds the document text; the matching ``.ann``
     file holds one keyphrase annotation per line.  Populates
     ``self._publications`` with one Publication per document.

     :param path: directory containing the .txt/.ann pairs
     """
     txt_files = [
         f for f in listdir(path)
         if isfile(join(path, f)) and f.endswith(".txt")
     ]
     self._publications = []
     for f in txt_files:
         filename = f[:-4]  # strip the ".txt" extension
         # join() instead of the original 'path + "\\" + f' keeps this
         # portable beyond Windows (and matches the listing above).
         with open(join(path, f), encoding='utf-8') as inputfile:
             # csv.reader splits each line on commas; re-join the cells
             # to restore the raw text, replacing non-breaking spaces.
             # NOTE(review): rows are concatenated without newlines, as
             # in the original — confirm downstream code expects that.
             text = ''.join(
                 ','.join(cell.replace('\xa0', ' ') for cell in row)
                 for row in csv.reader(inputfile))
         keyphrases = []
         with open(join(path, filename + '.ann'),
                   encoding='utf-8') as annfile:
             for row in csv.reader(annfile):
                 temp = row[0].split('\t')
                 temp_label = temp[1].split(' ')
                 if len(temp) == 3:
                     keyphrases.append(
                         Keyphrase(temp[0], temp_label[0], temp_label[1],
                                   temp_label[2], temp[2]))
                 elif len(temp) == 2:
                     # No surface form in the annotation: use a dummy.
                     keyphrases.append(
                         Keyphrase(temp[0], temp_label[0], temp_label[1],
                                   temp_label[2], '.'))
         self._publications.append(Publication(filename, text, keyphrases))
Example #3
0
def compose_publication(zipped_publication):
    """Assemble a Publication from zipped bibliographic fields."""
    grouped = group_publication(zipped_publication)
    field = grouped.get
    authors = Author.parse_authors(field("T_AUTHOR"))
    source = Source(field("T_JOURNAL"))
    misc = Misc(field("T_LOCATION"), field("T_PUBLISHER"), field("T_YEAR"),
                field("T_VOLUME"), field("T_PAGES"))
    return Publication(field("T_TITLE"), authors, source, misc)
Example #4
0
    def load_data(self):
        """load data of the conference."""
        venue_urls = self.crawl_more_venue_urls()
        publication_urls = self.crawl_publications(venue_urls)
        # Keep the crawled URLs alongside the constructed Publications.
        self.data = {
            "venue urls": venue_urls,
            "publication urls": publication_urls,
            "publications": [Publication(url) for url in publication_urls],
        }
Example #5
0
def extract_head(
        post: Publication) -> Tuple[Publication, Optional[Publication]]:
    """Split *post* so the head fits platform limits.

    :param post: the publication to split (may be mutated: a detached
        poll is popped from ``post.attachments``)
    :returns: ``(head, tail)``; ``tail`` is ``None`` when no split is
        required
    """
    text = post.plain_text
    # Over-long text: cut it and push everything else into the tail.
    if len(text) > 500:
        marker = ' ->'
        head_text, tail_text = smart_cut(text, 500 - len(marker))
        return (Publication(head_text + marker),
                Publication(tail_text, post.attachments))
    # Too many attachments: the first four stay with the text.
    if len(post.attachments) > 4:
        first_four = post.attachments[:4]
        overflow = post.attachments[4:]
        return (Publication(text, first_four),
                Publication(attachments=overflow))
    # A poll has to go out on its own as the head.
    for position, attachment in enumerate(post.attachments):
        if type(attachment) is Poll:
            detached = Publication(attachments=[attachment])
            post.attachments.pop(position)
            if not post.plain_text and not post.attachments:
                post = None  # nothing left besides the poll
            return detached, post
    return post, None
Example #6
0
    def extract(self, page):
        """Extract all the publications info in a given result page.

        :param page: result page as an BeautifulSoup4 object
        :returns: list of Publication
        """
        table = page.find_all('table')[2]
        data_rows = table.find_all('tr')[1:]  # first row is the header
        return [
            Publication(self.extract_attributes(row.find_all('td')))
            for row in data_rows
        ]
def new(d):
    """Create, persist, and return (as a dict) a Publication built from *d*."""
    pub = Publication()
    pub.abstract = d['abstract']
    pub.authors = [name.strip() for name in d['authors']]
    pub.gcs_file_path = d['file_path']
    pub.summary = d['summary']
    pub.title = d['title']
    pub.user = ndb.Key(User, d['user'])
    # Fresh publications start with zeroed counters.
    pub.num_comments = 0
    pub.num_views = 0
    pub.num_votes = 0
    # NOTE(review): tags were commented out upstream; d['tags'] is ignored.
    pub.put()
    return dictify_pub(pub)
Example #8
0
 def parse_post(self, raw: Dict[str, Any]) -> Publication:
     """Convert a raw VK wall-post dict into a Publication.

     Attachments that raise NeedExpand append their extra line to the
     text; unsupported attachments append a link to the original post.
     A lone poll whose title merely repeats the text clears the text.

     :param raw: post payload as returned by the VK API
     :returns: Publication with the post text and parsed attachments
     """
     text = raw['text']
     attachments = []
     for a in raw.get('attachments', []):
         try:
             attachments.append(self.parse_attachment(a))
         except NeedExpand as ex:
             text = add_line(text, ex.adding_line)
         except UnsupportedAttachment as ex:
             # `post_id` instead of `id` — avoids shadowing the builtin.
             post_id = raw['id']
             owner_id = raw['owner_id']
             original = f"https://vk.com/wall{owner_id}_{post_id}"
             text = add_line(text, f"Unsupported attachment type '{ex.type}'."
                                   " You may want look to original: "
                                   f"{original}")
     if len(attachments) == 1 and type(attachments[0]) is Poll:
         # The poll already carries its title; drop the duplicated text.
         if text == attachments[0].title:
             text = ''
     return Publication(text, attachments)
Example #9
0
    def load_data(self):
        """load data of author."""
        resp = requests.get(params.DBLP_PERSON_URL.format(urlpt=self.urlpt))

        xml = resp.content
        self.xml = xml  # keep the raw XML around for later inspection
        root = etree.fromstring(xml)
        publication_keys = root.xpath(
            '/dblpperson/dblpkey[not(@type)]/text()')
        homepage_keys = root.xpath(
            '/dblpperson/dblpkey[@type="person record"]/text()')
        homonym_names = root.xpath('/dblpperson/homonym/text()')
        self.data = {
            'name': root.attrib['name'],
            'publications': [Publication(key) for key in publication_keys],
            'homepages': homepage_keys,
            'homonyms': homonym_names,
        }
Example #10
0
 def get(self, after: float):
     """Return one Publication stamped with *after* as ISO-8601 text."""
     moment = datetime.fromtimestamp(after)
     print(f"getting after {moment} ({after})")
     return [Publication(moment.isoformat())]