Ejemplo n.º 1
0
def extract_head(
        post: Publication) -> Tuple[Publication, Optional[Publication]]:
    """Split *post* into a ``(head, tail)`` pair that fits platform limits.

    ``head`` is published first; ``tail`` (may be ``None``) carries the
    overflow text/attachments.

    Fixes over the previous version:
    * a post without attachments now returns ``(post, None)`` instead of
      falling through and implicitly returning ``None``, which broke the
      annotated ``Tuple`` contract;
    * removed a dead ``post = None`` branch (``post.plain_text`` was
      already known to be truthy at that point);
    * ``isinstance`` instead of ``type(...) is``;
    * videos merged into the picture group are no longer *also* kept as
      their own separate group (the old ``dict.get`` left them duplicated).
    """
    if not post.attachments:
        return post, None
    if len(post.plain_text) > 1024:
        # Text over the 1024-char limit: cut it and push every attachment
        # into the tail publication.
        head_text, tail_text = smart_cut(post.plain_text, 1024)
        return Publication(head_text), Publication(tail_text, post.attachments)
    for index, attach in enumerate(post.attachments):
        if isinstance(attach, Poll):
            if not post.plain_text:
                # Poll-only post: publish as-is.
                return post, None
            # Detach the poll into its own tail publication.
            post.attachments.pop(index)
            return post, Publication(attachments=[attach])
    # Group attachments by file type; videos ride along with pictures.
    groups = {}
    for item in post.attachments:
        groups.setdefault(item.type, []).append(item)
    if FileType.PICTURE in groups:
        groups[FileType.PICTURE].extend(groups.pop(FileType.VIDEO, []))
    groups = list(groups.values())
    head = Publication(post.plain_text, groups.pop(0))
    # NOTE(review): only the first leftover group survives as the tail; any
    # further groups are silently dropped, as in the original — confirm intended.
    tail = Publication(attachments=groups.pop(0)) if groups else None
    return head, tail
Ejemplo n.º 2
0
	def __load_txt(self):
		r"""Parse the bibliography file at ``self.filename`` into a list of dicts.

		Each well-formed line yields a dict with keys ``authors``,
		``article``, ``source`` and ``misc``; malformed lines are reported
		on stdout and skipped.

		NOTE(review): the ``\pL`` / ``\p{Lu}`` character classes below are
		not supported by the stdlib ``re`` module — they need the
		third-party ``regex`` package; confirm which module this file
		binds to the name ``re``.
		"""
		rn1 = r"(?P<authors>((\pL\. ?(\pL\. )?\pL+,? )|(\pL+ \pL\. ?(\pL\.)?,? )" #regular for authors
		rn2 = r"|(\p{Lu}\p{Ll}+ \p{Lu}\p{Ll}+,? )"
		rn3 = r")+)"
		ra_ru = r"(?P<article>\p{Lu}\p{Ll}+ \p{Ll}+.*?) *\/\/ *" #regular for article (Russian)
		ra_eng = r"(?P<article>\p{Lu}.*?) *\/\/ *" #regular for article (English)
		rj = r'(?P<source>[ \pL"“”]+)' #regular for source
		rm = r"(?P<misc>.+)" #regular for misc
		reg_ru = re.compile(rn1+rn2+rn3+ra_ru+rj+rm, re.UNICODE)
		reg_eng = re.compile(rn1+rn3+ra_eng+rj+rm, re.UNICODE)
		data = []
		# Fix: the handle was opened without ever being closed (leak).
		with open(self.filename, 'r') as f:
			items = f.read().split('\n')
		for item in items:
			line = item.strip()
			# English and Russian entries need different patterns.
			res = reg_eng.match(line) if isEnglish(item[:15]) else reg_ru.match(line)
			if res is not None:  # fix: was `res != None`
				# (dropped dead code that built a Publication only to discard it)
				data.append({"authors": split_authors(res.group("authors")),
							"article": res.group("article"),
							"source": res.group("source"),
							"misc": res.group("misc")})
			else:
				print("Wrong line: " + item)
		return data
Ejemplo n.º 3
0
class DBLPHandler(xml.sax.ContentHandler):
    """SAX handler that streams publication records out of a DBLP XML dump.

    Python 2 code (note the ``print`` statement in ``startElement`` and the
    str-returning ``encode`` in ``characters``).  Each element whose tag is
    in PUB_TYPES opens a new Publication; element tags in FIELDS accumulate
    their text content, which is attached to the current publication when
    the field element closes.
    """

    def __init__(self):
        # super(DBLPHandler, self).__init__()
        xml.sax.ContentHandler.__init__(self)
        # DBLP record-level element names: each one starts a new publication.
        self.PUB_TYPES = ["article", "inproceedings", "proceedings", "book", "incollection", "phdthesis", "mastersthesis", "www"]
        # Child element names whose text content becomes a publication field.
        self.FIELDS = ["author", "editor", "title", "booktitle", "pages", "year", "address", "journal", "volume", "number", "month", "url", "ee", "cdrom", "cite", "publisher", "note", "crossref", "isbn", "series", "school", "chapter"]

        self.publication = None  # Publication currently being filled
        self.content = ''        # text accumulated for the current element
        self.db = DB()           # destination store (project-defined)
        self.count = 0           # publications seen so far (progress counter)

    def startElement(self, name, attrs):
        if name in self.PUB_TYPES:
            # Progress output, one line per record (Python 2 print statement).
            print self.count
            self.count += 1

            # Every DBLP record carries a unique "key" attribute.
            key = attrs.getValue("key")
            self.publication = Publication(name, key)
            self.content = ''
        if name in self.FIELDS:
            # Reset the accumulator so only this field's text is collected.
            self.content = ''

    def endElement(self, name):
        if name in self.PUB_TYPES:
            # Record finished: persist it.
            self.db.dumps(self.publication)
        if name in self.FIELDS:
            # Field finished: attach the accumulated text to the record.
            self.publication.add_field(name, self.content)

    def characters(self, content):
        # UTF-8 encode and escape backslashes so the text survives dumping.
        self.content += content.encode('utf-8').replace('\\','\\\\')
 def read(self, path):
     """Read paired ``.txt``/``.ann`` files under *path* into ``self._publications``.

     For each ``<name>.txt`` the full text is loaded (csv rows re-joined
     with commas, non-breaking spaces normalized) and ``<name>.ann`` is
     parsed into Keyphrase objects (tab-separated brat-style annotations).

     Fixes over the previous version:
     * ``join(path, f)`` instead of ``path + '\\' + f`` — the literal
       backslash only worked on Windows even though ``join`` was already
       used two lines above;
     * text is accumulated in a list and joined once (the old ``text +=``
       loop was quadratic);
     * ``for i, cell in enumerate(row)`` instead of indexing the
       enumerate tuple.
     """
     txt_files = [
         f for f in listdir(path)
         if isfile(join(path, f)) and f.endswith(".txt")
     ]
     self._publications = []
     for f in txt_files:
         filename = f[:-4]  # strip the ".txt" suffix
         parts = []
         with open(join(path, f), encoding='utf-8') as inputfile:
             # csv.reader splits each line on commas; re-insert the commas
             # to reconstruct the original text of the line.
             for row in csv.reader(inputfile):
                 for i, cell in enumerate(row):
                     cleaned = cell.replace(u'\xa0', ' ')
                     parts.append(cleaned if i == 0 else ',' + cleaned)
         text = ''.join(parts)
         keyphrases = []
         with open(join(path, filename + '.ann'),
                   encoding='utf-8') as annfile:
             for row in csv.reader(annfile):
                 fields = row[0].split('\t')
                 label = fields[1].split(' ')
                 if len(fields) == 3:
                     keyphrases.append(
                         Keyphrase(fields[0], label[0], label[1],
                                   label[2], fields[2]))
                 elif len(fields) == 2:
                     keyphrases.append(
                         Keyphrase(fields[0], label[0], label[1],
                                   label[2], '.'))  # the dummy surface
         self._publications.append(Publication(filename, text, keyphrases))
Ejemplo n.º 5
0
    def startElement(self, name, attrs):
        """SAX callback: open a new Publication when a record tag starts.

        Python 2 code (``print`` statement).  Resets the text accumulator
        both for record-level tags (PUB_TYPES) and field tags (FIELDS).
        """
        if name in self.PUB_TYPES:
            # Progress output, one line per record.
            print self.count
            self.count += 1

            # Every DBLP record carries a unique "key" attribute.
            key = attrs.getValue("key")
            self.publication = Publication(name, key)
            self.content = ''
        if name in self.FIELDS:
            # Reset so only this field's text is collected by characters().
            self.content = ''
Ejemplo n.º 6
0
def compose_publication(zipped_publication):
    """Assemble a Publication from a zipped publication record.

    Groups the record's tagged fields, then wraps the author, journal and
    bibliographic fields into their respective model objects.
    """
    grouped = group_publication(zipped_publication)
    misc_values = [
        grouped.get(tag)
        for tag in ("T_LOCATION", "T_PUBLISHER", "T_YEAR",
                    "T_VOLUME", "T_PAGES")
    ]
    return Publication(
        grouped.get("T_TITLE"),
        Author.parse_authors(grouped.get("T_AUTHOR")),
        Source(grouped.get("T_JOURNAL")),
        Misc(*misc_values),
    )
Ejemplo n.º 7
0
    def load_data(self):
        """Load data of the conference."""
        urls_of_venues = self.crawl_more_venue_urls()
        urls_of_pubs = self.crawl_publications(urls_of_venues)
        # Cache everything on the instance, including one Publication
        # wrapper per crawled publication URL.
        self.data = {
            "venue urls": urls_of_venues,
            "publication urls": urls_of_pubs,
            "publications": [Publication(u) for u in urls_of_pubs],
        }
Ejemplo n.º 8
0
def extract_head(
        post: Publication, *, char_limit: int = 500,
        attach_limit: int = 4) -> Tuple[Publication, Optional[Publication]]:
    """Split *post* into a ``(head, tail)`` pair that fits platform limits.

    ``char_limit`` and ``attach_limit`` generalize the previously
    hard-coded 500-character and 4-attachment limits; their defaults keep
    existing callers' behavior identical.  ``tail`` may be ``None`` when
    everything fits in the head.
    """
    marker = ' ->'
    if len(post.plain_text) > char_limit:
        # Long text: cut it, append a continuation marker to the head and
        # move every attachment into the tail.
        head_text, tail_text = smart_cut(post.plain_text,
                                         char_limit - len(marker))
        return (Publication(head_text + marker),
                Publication(tail_text, post.attachments))
    if len(post.attachments) > attach_limit:
        # Too many attachments: the overflow goes into the tail.
        return (Publication(post.plain_text, post.attachments[:attach_limit]),
                Publication(attachments=post.attachments[attach_limit:]))
    for index, attach in enumerate(post.attachments):
        if isinstance(attach, Poll):  # idiom fix: was `type(attach) is Poll`
            # A poll is published on its own, ahead of the rest of the post.
            head = Publication(attachments=[attach])
            post.attachments.pop(index)
            if not post.plain_text and not post.attachments:
                post = None  # nothing left to publish after the poll
            return head, post
    return post, None
Ejemplo n.º 9
0
    def extract(self, page):
        """Extract all the publications info in a given result page.

        :param page: result page as an BeautifulSoup4 object
        :returns: list of Publication
        """
        # The third table on the page holds the results; its first row
        # is the header and is skipped.
        result_table = page.find_all('table')[2]
        data_rows = result_table.find_all('tr')[1:]
        return [
            Publication(self.extract_attributes(tr.find_all('td')))
            for tr in data_rows
        ]
Ejemplo n.º 10
0
class Article(Publication):
    """
        Articles form the heart of the site!  They are published in
        clearly separated categories.
    """
    Publication.register("Article")  # register this publication type

    # --- Fields ---
    sous_titre = models.CharField(
        'Sous titre',
        max_length=100,
        blank=True,
        null=True,
        help_text="Sous-titre d'accroche à l'article.")
    categorie = models.ForeignKey('Categorie',
                                  limit_choices_to={"profondeur__gt": 0})
    illustration = models.ImageField(
        'Image d\'illustration',
        upload_to="upload/articles/%Y/%m/%d",
        help_text=
        "Image qui apparaîtra à côté de l'article pour le représenter.",
        blank=True,
        null=True)
    mot_cles = models.CharField(
        'Mot clés associés à l\'article',
        max_length=200,
        blank=True,
        null=True,
        help_text=
        "Liste de mot clés facilitant la recherche d'articles/dossiers liés.",
        db_index=True)

    # The absolute URL of the article
    def get_absolute_url(self):
        # "/[categorie...]/[label]"
        return u"%s/%s.html" % (self.categorie.get_absolute_url(), self.label)

    # The article's breadcrumb (the categories leading to it)
    def get_arbo(self):
        return " > ".join([c.nom for c in self.categorie.get_parentes()])

    # Inner Meta class to set a few options on the model
    class Meta:
        unique_together = (
            ("titre", "categorie"), ("label", "categorie")
        )  # the title must be unique within each category, and so must the
        # label => allows two articles with the same name in two different categories
Ejemplo n.º 11
0
 def parse_post(self, raw: Dict[str, Any]) -> Publication:
     """Convert a raw VK post dict into a Publication.

     Attachments that need expansion contribute an extra text line;
     unsupported ones are replaced by a note linking to the original
     post.  A lone poll whose title duplicates the text clears the text.
     """
     body = raw['text']
     parsed = []
     for item in raw.get('attachments', []):
         try:
             parsed.append(self.parse_attachment(item))
         except NeedExpand as ex:
             body = add_line(body, ex.adding_line)
         except UnsupportedAttachment as ex:
             link = f"https://vk.com/wall{raw['owner_id']}_{raw['id']}"
             body = add_line(body, f"Unsupported attachment type '{ex.type}'."
                                   " You may want look to original: "
                                   f"{link}")
     if len(parsed) == 1 and type(parsed[0]) is Poll:
         if body == parsed[0].title:
             body = ''
     return Publication(body, parsed)
Ejemplo n.º 12
0
    def load_data(self):
        """Load data of author."""
        response = requests.get(
            params.DBLP_PERSON_URL.format(urlpt=self.urlpt))

        raw_xml = response.content
        self.xml = raw_xml
        root = etree.fromstring(raw_xml)
        # Publication keys are the <dblpkey> entries without a type attribute.
        pub_keys = root.xpath('/dblpperson/dblpkey[not(@type)]/text()')
        self.data = {
            'name':
            root.attrib['name'],
            'publications': [Publication(key) for key in pub_keys],
            'homepages':
            root.xpath('/dblpperson/dblpkey[@type="person record"]/text()'),
            'homonyms':
            root.xpath('/dblpperson/homonym/text()'),
        }
Ejemplo n.º 13
0
 def get(self, after: float):
     """Return one Publication stamped with *after* rendered as ISO text."""
     moment = datetime.fromtimestamp(after)
     print(f"getting after {moment} ({after})")
     return [Publication(moment.isoformat())]
Ejemplo n.º 14
0
class Edito(Publication):
    """
        Editos appear on the site's home page.
    """
    Publication.register("Edito")  # register this publication type
Ejemplo n.º 15
0
def new(d):
    """Create and persist a Publication from dict *d*; return its dict form.

    Counters (comments/views/votes) start at zero; the owner is resolved
    through an ndb User key.
    """
    pub = Publication()
    pub.abstract = d['abstract']
    pub.authors = [name.strip() for name in d['authors']]
    pub.gcs_file_path = d['file_path']
    pub.num_comments = pub.num_views = pub.num_votes = 0
    pub.summary = d['summary']
    pub.title = d['title']
    pub.user = ndb.Key(User, d['user'])
    pub.put()
    return dictify_pub(pub)
Ejemplo n.º 16
0
def get_pub_list(count=20):
    """Fetch up to *count* publications and return them in dict form."""
    fetched = Publication.query_pubs().fetch(count)
    return [dictify_pub(pub, True) for pub in fetched]