Example #1
0
  def __init__(self, url, j):
    """Fill the article fields from its URL and a positional field sequence.

    Args:
      url: article address; None is stored as ''.
      j: sequence read by position -- j[0] date text, j[1] title, j[2]
        summary, j[3] body. Each entry may be None, a str or a list. For a
        list the first element is used, except for the body, whose elements
        are joined with newlines. j[0]..j[3] are read unconditionally, so a
        shorter sequence fails on indexing.
    """
    self.url = url if url is not None else ''

    self.dtStr = ''
    self.timeStr = '00:00'
    val = j[0]
    if val is not None:
      if isinstance(val, str):
        self.dtStr = val
      elif isinstance(val, list):
        self.dtStr = val[0]
      # the first H:MM / HH:MM occurrence in the date text is taken as the time
      m = re.search(r'(\d{1,2}:\d{2})', self.dtStr)
      if m is not None:
        # left-pad a single-digit hour so the value is always HH:MM
        self.timeStr = m.group(0).rjust(5, '0')

    self.title = ''
    val = j[1]
    if val is not None:
      if isinstance(val, str):
        self.title = downloader_common.relpaceHtmlEntities(val)
      elif isinstance(val, list):
        self.title = downloader_common.relpaceHtmlEntities(val[0])
      if len(self.title) > 1000:
        # Keep only the first line of an over-long title. Slicing at
        # find('\n') cut off the last character when there was no newline
        # (find returns -1), so split on the first newline instead.
        self.title = self.title.split('\n', 1)[0]

    self.summary = ''
    val = j[2]
    if val is not None:
      if isinstance(val, str):
        self.summary = val.strip()
      elif isinstance(val, list):
        self.summary = val[0].strip()

    # No author field is read here; the former "remove author from body" step
    # could never run because author is always empty at this point.
    self.author = ''

    self.body = list()
    val = j[3]
    if val is not None:
      locText = ''
      if isinstance(val, str):
        locText = val
      elif isinstance(val, list):
        locText = '\n'.join(val)
      if len(self.summary) > 0 and len(locText) > 0:
        # remove the first occurrence of the summary from the body, trim
        locText = locText.replace(self.summary, '', 1).strip()
      text = locText.strip()

      # keep only non-empty lines
      for line in text.split('\n'):
        proLine = downloader_common.relpaceHtmlEntities(line.strip())
        if len(proLine) > 0:
          self.body.append(proLine)
Example #2
0
  def __init__(self, url, j):
    """Fill the article fields from its URL and a positional field sequence.

    Args:
      url: article address; None is stored as ''.
      j: sequence read by position -- j[0] date text, j[1] title, j[2]
        source, j[3] body, j[4] column type. Each entry may be None, a str or
        a list. For a list the first element is used, except for the body,
        whose elements are joined with newlines. j[0]..j[4] are read
        unconditionally, so a shorter sequence fails on indexing.
    """
    super().__init__()
    self.url = url if url is not None else ''

    self.dtStr = ''
    self.timeStr = '00:00'
    val = j[0]
    if val is not None:
      if isinstance(val, str):
        self.dtStr = val
      elif isinstance(val, list):
        self.dtStr = val[0]

    self.title = ''
    val = j[1]
    if val is not None:
      if isinstance(val, str):
        self.title = downloader_common.relpaceHtmlEntities(val)
      elif isinstance(val, list):
        self.title = downloader_common.relpaceHtmlEntities(val[0])
      if len(self.title) > 1000:
        # Keep only the first line of an over-long title. Slicing at
        # find('\n') cut off the last character when there was no newline
        # (find returns -1), so split on the first newline instead.
        self.title = self.title.split('\n', 1)[0]

    self.source = ''
    val = j[2]
    if val is not None:
      if isinstance(val, str):
        self.source = val.strip()
      elif isinstance(val, list):
        self.source = val[0].strip()

    self.author = ''

    self.body = list()
    val = j[3]
    if val is not None:
      locText = ''
      if isinstance(val, str):
        locText = val
      elif isinstance(val, list):
        locText = '\n'.join(val)

      # keep only non-empty lines
      for line in locText.split('\n'):
        proLine = downloader_common.relpaceHtmlEntities(line.strip())
        if len(proLine) > 0:
          self.body.append(proLine)

    self.coltype = ''
    val = j[4]
    if val is not None:
      # a str value is stored as-is; only the list form is stripped
      if isinstance(val, str):
        self.coltype = val
      elif isinstance(val, list):
        self.coltype = val[0].strip()
Example #3
0
    def __init__(self, url, j):
        """Build the article from its URL and a positional field sequence.

        Args:
            url: article address; None is stored as ''.
            j: sequence read by position -- j[0] date text, j[1] title,
                j[2] body, j[3] author. Missing trailing entries are
                tolerated (every index is guarded by len(j)).
        """
        self.url = '' if url is None else url

        # date text is stored untouched; no type check is done on it
        self.dtStr = ''
        rawDate = j[0] if len(j) > 0 else None
        if rawDate is not None:
            self.dtStr = rawDate

        # time is the first HH:MM found in a date text longer than 4 chars
        clock = '00:00'
        if len(self.dtStr) > 4:
            found = re.search(r'\d{2}:\d{2}', self.dtStr)
            if found:
                clock = found.group()
        self.timeStr = clock

        # title: a list is joined with single spaces
        self.title = ''
        rawTitle = j[1] if len(j) > 1 else None
        if isinstance(rawTitle, str):
            self.title = downloader_common.relpaceHtmlEntities(rawTitle)
        elif isinstance(rawTitle, list):
            self.title = downloader_common.relpaceHtmlEntities(
                ' '.join(rawTitle))

        self.summary = ''

        # body: a list is joined with newlines, then blank lines are dropped
        self.body = list()
        rawBody = j[2] if len(j) > 2 else None
        if rawBody is not None:
            if isinstance(rawBody, str):
                joined = rawBody
            elif isinstance(rawBody, list):
                joined = '\n'.join(rawBody)
            else:
                joined = ''

            for chunk in joined.strip().split('\n'):
                cleaned = downloader_common.relpaceHtmlEntities(chunk.strip())
                if len(cleaned) > 0:
                    self.body.append(cleaned)

        self.author = ''
        if len(j) > 3 and j[3] is not None:
            self.author = j[3].strip()
Example #4
0
    def __init__(self, url, j):
        """Build the article from its URL and a positional field sequence.

        Args:
            url: article address; None is stored as ''.
            j: sequence read by position -- j[0] date text, j[1] title,
                j[2] body, j[3] author. Missing trailing entries are
                tolerated (every index is guarded by len(j)).
        """
        self.url = '' if url is None else url

        # date text: a str is used directly, a list contributes its first item
        self.dtStr = ''
        rawDate = j[0] if len(j) > 0 else None
        if isinstance(rawDate, str):
            self.dtStr = rawDate
        elif isinstance(rawDate, list):
            self.dtStr = rawDate[0]

        # the time is taken to be the last five characters of the date text
        self.timeStr = self.dtStr[-5:] if len(self.dtStr) > 4 else '00:00'

        # title: a list is joined with single spaces
        self.title = ''
        rawTitle = j[1] if len(j) > 1 else None
        if isinstance(rawTitle, str):
            self.title = downloader_common.relpaceHtmlEntities(rawTitle)
        elif isinstance(rawTitle, list):
            self.title = downloader_common.relpaceHtmlEntities(
                ' '.join(rawTitle))

        self.summary = ''

        # body: only the first element of a list is used
        self.body = list()
        rawBody = j[2] if len(j) > 2 else None
        if rawBody is not None:
            if isinstance(rawBody, str):
                content = rawBody
            elif isinstance(rawBody, list):
                content = rawBody[0]
            else:
                content = ''

            # keep only non-empty lines
            for chunk in content.strip().split('\n'):
                cleaned = downloader_common.relpaceHtmlEntities(chunk.strip())
                if len(cleaned) > 0:
                    self.body.append(cleaned)

        self.author = ''
        if len(j) > 3 and j[3] is not None:
            self.author = j[3].strip()
    def __init__(self, url, j):
        """Build the article from its URL and a positional field sequence.

        Args:
            url: article address; None is stored as ''.
            j: sequence read by position -- j[0] date text, j[1] title,
                j[2] body (all three read unconditionally), j[3] author
                (optional, guarded by len(j)).
        """
        self.url = '' if url is None else url

        # date text is stored untouched; its last five characters are the time
        stamp = j[0]
        self.dtStr = stamp if stamp is not None else ''
        self.timeStr = self.dtStr[-5:] if len(self.dtStr) > 4 else '00:00'

        # title: a list contributes only its first element
        self.title = ''
        rawTitle = j[1]
        if isinstance(rawTitle, str):
            self.title = downloader_common.relpaceHtmlEntities(rawTitle)
        elif isinstance(rawTitle, list):
            self.title = downloader_common.relpaceHtmlEntities(rawTitle[0])

        self.summary = ''

        self.body = list()
        rawBody = j[2]
        if rawBody is not None:
            if isinstance(rawBody, str):
                content = rawBody
            elif isinstance(rawBody, list):
                content = rawBody[0]
            else:
                content = ''
            content = content.strip()

            # strip html comments, including ones spanning several lines
            content = re.sub("(<!--.*?-->)",
                             "",
                             content,
                             flags=re.MULTILINE | re.DOTALL)

            # drop everything from the print-version marker onwards
            content = content.partition('Версія для друку')[0]

            # keep only non-empty lines
            for chunk in content.split('\n'):
                cleaned = downloader_common.relpaceHtmlEntities(chunk.strip())
                if len(cleaned) > 0:
                    self.body.append(cleaned)

        self.author = ''
        if len(j) > 3 and j[3] is not None:
            self.author = j[3].strip()
    def __init__(self, url, j):
        """Build the article from its URL and a positional field sequence.

        Args:
            url: article address; None is stored as ''.
            j: sequence read by position -- j[0] title (read
                unconditionally), j[1] body (optional, guarded by len(j)).
                Each may be None, a str or a list.

        No date, summary or author is read; those fields get fixed defaults.
        """
        self.url = url if url is not None else ''

        self.dtStr = ''
        self.timeStr = '00:00'

        self.title = ''
        val = j[0]
        if val is not None:
            if isinstance(val, str):
                self.title = downloader_common.relpaceHtmlEntities(val)
            elif isinstance(val, list):
                self.title = downloader_common.relpaceHtmlEntities(val[0])
            if len(self.title) > 1000:
                # Keep only the first line of an over-long title. Slicing at
                # find('\n') cut off the last character when there was no
                # newline (find returns -1), so split on the first newline.
                self.title = self.title.split('\n', 1)[0]

        self.summary = ''

        self.body = list()
        # Trailer markers, in priority order: the body is cut at the first
        # marker of this tuple that occurs anywhere in the text.
        cutMarkers = (
            'Підписуйтесь на новини "МБ" у соцмережах',
            'Приєднуйтесь до "МБ" у соцмережах',
            '\nРейтинг:\n',
        )
        if len(j) > 1:
            val = j[1]
            if val is not None:
                locText = ''
                if isinstance(val, str):
                    locText = val
                elif isinstance(val, list):
                    locText = '\n'.join(val)

                text = locText.strip()
                for marker in cutMarkers:
                    if marker in text:
                        text = text[:text.find(marker)]
                        break

                # keep only non-empty lines
                for line in text.split('\n'):
                    proLine = downloader_common.relpaceHtmlEntities(
                        line.strip())
                    if len(proLine) > 0:
                        self.body.append(proLine)

        self.author = ''
Example #7
0
    def __init__(self, url, j):
        """Build the article from its URL and a positional field sequence.

        Args:
            url: article address; None is stored as ''.
            j: sequence read by position -- j[0] date text, j[1] title,
                j[2] body (all three read unconditionally), j[3] author
                (optional, guarded by len(j)).
        """
        self.url = '' if url is None else url

        # date text is stored untouched; the time is always the default
        stamp = j[0]
        self.dtStr = stamp if stamp is not None else ''
        self.timeStr = '00:00'

        # title: a list contributes only its first element
        self.title = ''
        rawTitle = j[1]
        if isinstance(rawTitle, str):
            self.title = downloader_common.relpaceHtmlEntities(rawTitle)
        elif isinstance(rawTitle, list):
            self.title = downloader_common.relpaceHtmlEntities(rawTitle[0])

        self.summary = ''

        # the author is resolved before the body so it can be removed from it
        self.author = ''
        if len(j) > 3 and j[3] is not None:
            self.author = j[3].strip()

        self.body = list()
        rawBody = j[2]
        if rawBody is not None:
            if isinstance(rawBody, str):
                content = rawBody
            elif isinstance(rawBody, list):
                content = rawBody[0]
            else:
                content = ''
            content = content.strip()

            # drop everything from the print-version marker onwards
            content = content.partition('Друкована версія')[0]

            # remove every occurrence of the author name from the text
            if len(self.author) > 0:
                content = content.replace(self.author, '')

            # keep only non-empty lines
            for chunk in content.split('\n'):
                cleaned = downloader_common.relpaceHtmlEntities(chunk.strip())
                if len(cleaned) > 0:
                    self.body.append(cleaned)
  def __init__(self, url, j):
    """Fill the article fields from its URL and a positional field sequence.

    Args:
      url: article address; None is stored as ''.
      j: sequence read by position -- j[0] title, j[1] summary, j[2] body,
        j[3] author. All four are read unconditionally, so a shorter sequence
        fails on indexing. The summary is stripped directly, so it is
        expected to be a str (or None).

    No date is read; dtStr and timeStr get fixed defaults.
    """
    self.url = url if url is not None else ''

    self.dtStr = ''
    self.timeStr = '00:00'

    self.title = ''
    val = j[0]
    if val is not None:
      if isinstance(val, str):
        self.title = downloader_common.relpaceHtmlEntities(val)
      elif isinstance(val, list):
        self.title = downloader_common.relpaceHtmlEntities(val[0])
      if len(self.title) > 1000:
        # Keep only the first line of an over-long title. Slicing at
        # find('\n') cut off the last character when there was no newline
        # (find returns -1), so split on the first newline instead.
        self.title = self.title.split('\n', 1)[0]

    self.summary = ''
    val = j[1]
    if val is not None:
      self.summary = val.strip()

    self.body = list()
    val = j[2]
    if val is not None:
      locText = ''
      if isinstance(val, str):
        locText = val
      elif isinstance(val, list):
        locText = '\n'.join(val)

      text = locText.strip()

      # keep only non-empty lines
      for line in text.split('\n'):
        proLine = downloader_common.relpaceHtmlEntities(line.strip())
        if len(proLine) > 0:
          self.body.append(proLine)

    # author: only a non-blank str is accepted; one trailing comma is removed
    self.author = ''
    val = j[3]
    if isinstance(val, str):
      author = val.strip()
      if author:
        self.author = author[:-1] if author.endswith(',') else author
  def __init__(self, url, j):
    """Fill the article fields from its URL and a positional field sequence.

    Args:
      url: article address; None is stored as ''.
      j: sequence read by position -- j[0] date text (stored untouched),
        j[1] author (a list is joined with ', '), j[2] title (a list
        contributes its first element), j[3] body (a list is joined with
        newlines). All four are read unconditionally, so a shorter sequence
        fails on indexing.
    """
    self.url = url if url is not None else ''

    self.dtStr = ''
    val = j[0]
    if val is not None:
      self.dtStr = val

    self.author = ''
    val = j[1]
    if val is not None:
      if isinstance(val, str):
        self.author = val
      elif isinstance(val, list):
        self.author = ', '.join(val)

    self.title = ''
    val = j[2]
    if val is not None:
      if isinstance(val, str):
        self.title = downloader_common.relpaceHtmlEntities(val)
      # This branch used to be dedented onto the outer "is not None" check,
      # which made it unreachable for lists; it belongs to the type dispatch.
      elif isinstance(val, list):
        self.title = downloader_common.relpaceHtmlEntities(val[0])

    self.summary = ''

    self.body = list()
    val = j[3]
    if val is not None:
      locText = ''
      if isinstance(val, str):
        locText = val
      elif isinstance(val, list):
        locText = '\n'.join(val)

      text = locText.strip()

      # keep only non-empty lines
      for line in text.split('\n'):
        proLine = downloader_common.relpaceHtmlEntities(line.strip())
        if len(proLine) > 0:
          self.body.append(proLine)
Example #10
0
 def storeUrlToFile(self, url):
     """Download url, convert it from windows-1251 to UTF-8 and save it in /tmp.

     The page bytes are decoded as windows-1251, every literal
     'windows-1251' in the text (e.g. the charset declaration) is replaced
     with 'utf-8', and the result is passed through
     downloader_common.relpaceHtmlEntities before being written.

     Args:
         url: page address; also used to derive the file name ('https://'
             removed, '/' replaced by '_', '.html' appended).

     Returns:
         Path of the written file.
     """
     # store the page locally and do the encoding conversion
     # (xidel can't manage it correctly)
     with urllib.request.urlopen(url) as response:  # close the connection
         rawPage = response.read()
     content = rawPage.decode('windows-1251').replace(
         'windows-1251', 'utf-8')
     content = downloader_common.relpaceHtmlEntities(content)
     fileName = '/tmp/' + url.replace('https://', '').replace('/',
                                                              '_') + '.html'
     # The text now declares utf-8, so write it as utf-8 explicitly instead
     # of relying on the locale's default encoding.
     with open(fileName, "w", encoding='utf-8') as htmlFile:
         htmlFile.write(content)
     return fileName
    def __init__(self, url, j):
        """Build the article from its URL and a positional field sequence.

        Args:
            url: article address; None is stored as ''.
            j: sequence read by position -- j[0] date (a str, or a list
                whose first two elements are a date part and a time part),
                j[1] title, j[2] body. All three are read unconditionally.
        """
        self.url = '' if url is None else url

        self.dtStr = ''
        self.timeStr = '00:00:00'
        rawDate = j[0]
        if isinstance(rawDate, str):
            self.dtStr = rawDate
        elif isinstance(rawDate, list):
            # keep what follows the first comma of the date part (the whole
            # part when there is no comma) and append the time part
            datePart = str(rawDate[0]).split(',', 1)[-1].strip()
            self.dtStr = datePart + ', ' + rawDate[1]
            self.timeStr = str(rawDate[1]).strip()

        # title: a list contributes only its first element
        self.title = ''
        rawTitle = j[1]
        if isinstance(rawTitle, str):
            self.title = downloader_common.relpaceHtmlEntities(rawTitle)
        elif isinstance(rawTitle, list):
            self.title = downloader_common.relpaceHtmlEntities(rawTitle[0])

        self.body = list()
        rawBody = j[2]
        if rawBody is not None:
            if isinstance(rawBody, str):
                joined = rawBody
            elif isinstance(rawBody, list):
                joined = '\n'.join(rawBody)
            else:
                joined = ''

            # keep non-empty lines, skipping "read also" cross-links
            for chunk in joined.strip().split('\n'):
                cleaned = downloader_common.relpaceHtmlEntities(chunk.strip())
                if len(cleaned) == 0 or 'ЧИТАЙТЕ ТАКОЖ:' in cleaned:
                    continue
                self.body.append(cleaned)
  def __init__(self, url, j):
    """Fill the article fields from its URL and a positional field sequence.

    Args:
      url: article address; None is stored as ''.
      j: sequence read by position -- j[0] date text, j[1] title, j[2] body
        (all three read unconditionally), then optional j[3] summary, j[4]
        author and j[5] fallback author, each guarded by len(j).
    """
    super().__init__()
    self.url = '' if url is None else url

    # date text is stored untouched; the time is always the default
    stamp = j[0]
    self.dtStr = stamp if stamp is not None else ''
    self.timeStr = '00:00'

    # title: a list contributes only its first element
    self.title = ''
    rawTitle = j[1]
    if isinstance(rawTitle, str):
      self.title = downloader_common.relpaceHtmlEntities(rawTitle)
    elif isinstance(rawTitle, list):
      self.title = downloader_common.relpaceHtmlEntities(rawTitle[0])

    self.body = list()
    rawBody = j[2]
    if rawBody is not None:
      if isinstance(rawBody, str):
        joined = rawBody
      elif isinstance(rawBody, list):
        joined = '\n'.join(rawBody)
      else:
        joined = ''
      joined = joined.strip()

      # strip HTML comments, including ones spanning several lines
      joined = re.sub("(<!--.*?-->)", "", joined, flags=re.MULTILINE|re.DOTALL)

      # drop empty lines and "read also" lines; a line starting with the
      # Economist agreement notice is replaced by a fixed placeholder
      for chunk in joined.split('\n'):
        cleaned = downloader_common.relpaceHtmlEntities(chunk.strip())
        if len(cleaned) == 0 or cleaned.startswith('Читайте також:'):
          continue
        restricted = cleaned.startswith('Відповідно до угоди, статті the Economist')
        self.body.append('content deleted' if restricted else cleaned)

    self.summary = ''
    if len(j) > 3 and j[3] is not None:
      self.summary = j[3].strip()

    # primary author: a list is joined with '; '; a value containing the
    # print-version marker is ignored
    self.author = ''
    if len(j) > 4:
      candidate = j[4]
      if candidate is not None:
        if isinstance(candidate, list):
          candidate = '; '.join(candidate)
        if 'Версія для друку' not in candidate:
          self.author = candidate.strip()

    # fallback author, consulted only while no author has been found
    if len(j) > 5 and len(self.author) < 1:
      candidate = j[5]
      if candidate is not None:
        if isinstance(candidate, list):
          candidate = '; '.join(candidate)
        self.author = candidate.strip()
    def __init__(self, url, j):
        """Build the article from its URL and a positional field sequence.

        Args:
            url: article address; None is stored as ''.
            j: sequence read by position -- j[0] date text, j[1] title,
                j[2] summary, j[3] body. Every index is guarded by len(j),
                so missing entries fall back to empty defaults. Title and
                summary lists are joined with single spaces.
        """
        self.url = url if url is not None else ''

        self.dtStr = ''
        # guard the index like every other field below, so an empty j no
        # longer raises on the date lookup
        if len(j) > 0 and j[0] is not None:
            self.dtStr = j[0]
        if len(self.dtStr) > 4:
            self.timeStr = self.dtStr[-5:]  # extract time (last five chars)
        else:
            self.timeStr = '00:00'

        self.title = ''
        val = j[1] if len(j) > 1 else None
        if val is not None:
            if isinstance(val, str):
                self.title = downloader_common.relpaceHtmlEntities(val)
            elif isinstance(val, list):
                self.title = downloader_common.relpaceHtmlEntities(
                    ' '.join(val))

        self.summary = ''
        val = j[2] if len(j) > 2 else None
        if val is not None:
            if isinstance(val, str):
                self.summary = downloader_common.relpaceHtmlEntities(
                    val).strip()
            elif isinstance(val, list):
                self.summary = downloader_common.relpaceHtmlEntities(
                    ' '.join(val)).strip()

        self.body = list()
        val = j[3] if len(j) > 3 else None
        if val is not None:
            locText = ''
            if isinstance(val, str):
                locText = val
            elif isinstance(val, list):
                parts = []
                for line in val:
                    if 'За матеріалами:' in line:
                        # put the attribution on one line: drop newlines,
                        # collapse whitespace runs to single spaces and tag
                        # the line with '@@@ '
                        line = '@@@ ' + ' '.join(
                            line.replace('\n', '').split())
                    parts.append('\n' + line)
                # single join instead of repeated string concatenation
                locText = ''.join(parts)

            if len(self.summary) > 0 and len(locText) > 0:
                # remove the first occurrence of the summary from the body
                text = locText.replace(self.summary, '', 1).strip()
            else:
                # no summary
                text = locText.strip()

            # keep non-empty lines up to the first "read also" line
            for line in text.split('\n'):
                proLine = line.strip()
                if 'Читайте також:' in proLine:
                    break
                if len(proLine) > 0:
                    self.body.append(
                        downloader_common.relpaceHtmlEntities(proLine))