def __init__(self, url, j):
    """Populate the instance from ``url`` and the positional field list ``j``.

    Args:
        url: Address of the item; ``None`` is stored as an empty string.
        j: Indexable collection read positionally: ``j[0]`` date/time text,
            ``j[1]`` title, ``j[2]`` summary, ``j[3]`` body text. Each entry
            may be ``None``, a string or a list of strings. All four
            positions are read unconditionally.
    """
    self.url = url if url is not None else ''

    # Date text; the time stays at midnight unless an H:MM / HH:MM is found.
    self.dtStr = ''
    self.timeStr = '00:00'
    rawDate = j[0]
    if isinstance(rawDate, str):
        self.dtStr = rawDate
    elif isinstance(rawDate, list) and rawDate:
        self.dtStr = rawDate[0]
    timeMatch = re.search(r'(\d{1,2}:\d{2})', self.dtStr)
    if timeMatch is not None:
        # Zero-pad single-digit hours ("9:05" -> "09:05").
        self.timeStr = timeMatch.group(0).zfill(5)

    self.title = ''
    rawTitle = j[1]
    if isinstance(rawTitle, str):
        self.title = downloader_common.relpaceHtmlEntities(rawTitle)
    elif isinstance(rawTitle, list) and rawTitle:
        self.title = downloader_common.relpaceHtmlEntities(rawTitle[0])
    if len(self.title) > 1000:
        # Over-long title: keep only its first line. split() leaves a title
        # without a line break intact, whereas slicing with find() == -1
        # would silently drop the last character.
        self.title = self.title.split('\n', 1)[0]

    self.summary = ''
    rawSummary = j[2]
    if isinstance(rawSummary, str):
        self.summary = rawSummary.strip()
    elif isinstance(rawSummary, list) and rawSummary:
        self.summary = rawSummary[0].strip()

    self.author = ''
    self.body = []
    rawBody = j[3]
    if rawBody is not None:
        if isinstance(rawBody, str):
            text = rawBody
        elif isinstance(rawBody, list):
            text = '\n'.join(rawBody)
        else:
            text = ''
        if self.summary and text:
            # The body may repeat the summary; drop its first occurrence.
            text = text.replace(self.summary, '', 1)
        # Keep only non-empty lines, with HTML entities replaced.
        for line in text.strip().split('\n'):
            cleaned = downloader_common.relpaceHtmlEntities(line.strip())
            if cleaned:
                self.body.append(cleaned)
def __init__(self, url, j):
    """Populate the instance from ``url`` and the positional field list ``j``.

    Args:
        url: Address of the item; ``None`` is stored as an empty string.
        j: Indexable collection read positionally: ``j[0]`` date text,
            ``j[1]`` title, ``j[2]`` source, ``j[3]`` body text, ``j[4]``
            column type. Each entry may be ``None``, a string or a list of
            strings. All five positions are read unconditionally.
    """
    super().__init__()
    self.url = url if url is not None else ''

    self.dtStr = ''
    self.timeStr = '00:00'
    rawDate = j[0]
    if isinstance(rawDate, str):
        self.dtStr = rawDate
    elif isinstance(rawDate, list) and rawDate:
        self.dtStr = rawDate[0]

    self.title = ''
    rawTitle = j[1]
    if isinstance(rawTitle, str):
        self.title = downloader_common.relpaceHtmlEntities(rawTitle)
    elif isinstance(rawTitle, list) and rawTitle:
        self.title = downloader_common.relpaceHtmlEntities(rawTitle[0])
    if len(self.title) > 1000:
        # Over-long title: keep only its first line. split() leaves a title
        # without a line break intact, whereas slicing with find() == -1
        # would silently drop the last character.
        self.title = self.title.split('\n', 1)[0]

    self.source = ''
    rawSource = j[2]
    if isinstance(rawSource, str):
        self.source = rawSource.strip()
    elif isinstance(rawSource, list) and rawSource:
        self.source = rawSource[0].strip()

    self.author = ''
    self.body = []
    rawBody = j[3]
    if rawBody is not None:
        if isinstance(rawBody, str):
            text = rawBody
        elif isinstance(rawBody, list):
            text = '\n'.join(rawBody)
        else:
            text = ''
        # Keep only non-empty lines, with HTML entities replaced.
        for line in text.split('\n'):
            cleaned = downloader_common.relpaceHtmlEntities(line.strip())
            if cleaned:
                self.body.append(cleaned)

    # A plain string is stored as-is; only a list's first element is stripped.
    self.coltype = ''
    rawColtype = j[4]
    if isinstance(rawColtype, str):
        self.coltype = rawColtype
    elif isinstance(rawColtype, list) and rawColtype:
        self.coltype = rawColtype[0].strip()
def __init__(self, url, j):
    """Fill the instance from ``url`` and the positional field list ``j``.

    Slots of ``j`` are optional and only read when present: 0 - date/time
    text, 1 - title, 2 - body text, 3 - author. ``summary`` is always left
    empty.
    """
    total = len(j)
    self.url = '' if url is None else url

    # The date value is stored untouched; the time is the first HH:MM in it.
    rawDate = j[0] if total > 0 else None
    self.dtStr = '' if rawDate is None else rawDate
    timeFound = None
    if len(self.dtStr) > 4:
        timeFound = re.search(r'\d{2}:\d{2}', self.dtStr)
    self.timeStr = timeFound.group() if timeFound else '00:00'

    # Title: a list of fragments is glued together with single spaces.
    rawTitle = j[1] if total > 1 else None
    self.title = ''
    if isinstance(rawTitle, list):
        self.title = downloader_common.relpaceHtmlEntities(' '.join(rawTitle))
    elif isinstance(rawTitle, str):
        self.title = downloader_common.relpaceHtmlEntities(rawTitle)

    self.summary = ''

    # Body: one entry per non-empty line, HTML entities replaced.
    rawBody = j[2] if total > 2 else None
    self.body = list()
    if rawBody is not None:
        if isinstance(rawBody, list):
            joined = '\n'.join(rawBody)
        elif isinstance(rawBody, str):
            joined = rawBody
        else:
            joined = ''
        converted = (
            downloader_common.relpaceHtmlEntities(chunk.strip())
            for chunk in joined.strip().split('\n')
        )
        self.body.extend(chunk for chunk in converted if len(chunk) > 0)

    self.author = ''
    if total > 3:
        rawAuthor = j[3]
        if rawAuthor is not None:
            self.author = rawAuthor.strip()
def __init__(self, url, j):
    """Fill the instance from ``url`` and the positional field list ``j``.

    Slots of ``j`` are optional and only read when present: 0 - date/time
    text, 1 - title, 2 - body text, 3 - author. ``summary`` is always left
    empty.
    """
    total = len(j)
    self.url = '' if url is None else url

    # Date: a string is taken as-is, a list contributes its first element.
    rawDate = j[0] if total > 0 else None
    self.dtStr = ''
    if isinstance(rawDate, list):
        self.dtStr = rawDate[0]
    elif isinstance(rawDate, str):
        self.dtStr = rawDate
    # The time is taken from the last five characters of the date text.
    self.timeStr = self.dtStr[-5:] if len(self.dtStr) > 4 else '00:00'

    # Title: a list of fragments is glued together with single spaces.
    rawTitle = j[1] if total > 1 else None
    self.title = ''
    if isinstance(rawTitle, list):
        self.title = downloader_common.relpaceHtmlEntities(' '.join(rawTitle))
    elif isinstance(rawTitle, str):
        self.title = downloader_common.relpaceHtmlEntities(rawTitle)

    self.summary = ''

    # Body: only the first element of a list is used.
    rawBody = j[2] if total > 2 else None
    self.body = list()
    if rawBody is not None:
        if isinstance(rawBody, list):
            bodyText = rawBody[0]
        elif isinstance(rawBody, str):
            bodyText = rawBody
        else:
            bodyText = ''
        # Store every non-empty line with HTML entities replaced.
        for chunk in bodyText.strip().split('\n'):
            converted = downloader_common.relpaceHtmlEntities(chunk.strip())
            if len(converted) > 0:
                self.body.append(converted)

    self.author = ''
    if total > 3:
        rawAuthor = j[3]
        if rawAuthor is not None:
            self.author = rawAuthor.strip()
def __init__(self, url, j):
    """Fill the instance from ``url`` and the positional field list ``j``.

    ``j[0]`` (date/time text), ``j[1]`` (title) and ``j[2]`` (body text) are
    always read; ``j[3]`` (author) only when ``j`` has more than three
    entries. ``summary`` is always left empty.
    """
    self.url = '' if url is None else url

    rawDate = j[0]
    self.dtStr = '' if rawDate is None else rawDate
    # The time is taken from the last five characters of the date text.
    self.timeStr = self.dtStr[-5:] if len(self.dtStr) > 4 else '00:00'

    rawTitle = j[1]
    self.title = ''
    if isinstance(rawTitle, list):
        self.title = downloader_common.relpaceHtmlEntities(rawTitle[0])
    elif isinstance(rawTitle, str):
        self.title = downloader_common.relpaceHtmlEntities(rawTitle)

    self.summary = ''

    rawBody = j[2]
    self.body = list()
    if rawBody is not None:
        # Only the first element of a list is used.
        if isinstance(rawBody, list):
            bodyText = rawBody[0]
        elif isinstance(rawBody, str):
            bodyText = rawBody
        else:
            bodyText = ''
        # Drop HTML comments, including ones that span several lines.
        bodyText = re.sub("(<!--.*?-->)", "", bodyText.strip(),
                          flags=re.MULTILINE | re.DOTALL)
        # Discard the print-version marker and everything after it.
        bodyText = bodyText.partition('Версія для друку')[0]
        # Store every non-empty line with HTML entities replaced.
        for chunk in bodyText.split('\n'):
            converted = downloader_common.relpaceHtmlEntities(chunk.strip())
            if len(converted) > 0:
                self.body.append(converted)

    self.author = ''
    if len(j) > 3:
        rawAuthor = j[3]
        if rawAuthor is not None:
            self.author = rawAuthor.strip()
def __init__(self, url, j):
    """Populate the instance from ``url`` and the positional field list ``j``.

    Args:
        url: Address of the item; ``None`` is stored as an empty string.
        j: Indexable collection: ``j[0]`` title (always read) and ``j[1]``
            body text (read only when present). Each entry may be ``None``,
            a string or a list of strings.

    ``dtStr``, ``summary`` and ``author`` are left empty and ``timeStr`` is
    set to ``'00:00'``.
    """
    self.url = url if url is not None else ''
    self.dtStr = ''
    self.timeStr = '00:00'

    self.title = ''
    rawTitle = j[0]
    if isinstance(rawTitle, str):
        self.title = downloader_common.relpaceHtmlEntities(rawTitle)
    elif isinstance(rawTitle, list) and rawTitle:
        self.title = downloader_common.relpaceHtmlEntities(rawTitle[0])
    if len(self.title) > 1000:
        # Over-long title: keep only its first line. split() leaves a title
        # without a line break intact, whereas slicing with find() == -1
        # would silently drop the last character.
        self.title = self.title.split('\n', 1)[0]

    self.summary = ''
    self.author = ''
    self.body = []

    rawBody = j[1] if len(j) > 1 else None
    if rawBody is None:
        return
    if isinstance(rawBody, str):
        text = rawBody
    elif isinstance(rawBody, list):
        text = '\n'.join(rawBody)
    else:
        text = ''
    text = text.strip()

    # Trailing page boilerplate: cut at the first marker, in priority order,
    # that occurs in the text.
    cutMarkers = (
        'Підписуйтесь на новини "МБ" у соцмережах',
        'Приєднуйтесь до "МБ" у соцмережах',
        '\nРейтинг:\n',
    )
    for marker in cutMarkers:
        if marker in text:
            text = text[:text.find(marker)]
            break

    # Keep only non-empty lines, with HTML entities replaced.
    for line in text.split('\n'):
        cleaned = downloader_common.relpaceHtmlEntities(line.strip())
        if cleaned:
            self.body.append(cleaned)
def __init__(self, url, j):
    """Fill the instance from ``url`` and the positional field list ``j``.

    ``j[0]`` (date text), ``j[1]`` (title) and ``j[2]`` (body text) are
    always read; ``j[3]`` (author) only when ``j`` has more than three
    entries. ``summary`` stays empty and ``timeStr`` is ``'00:00'``.
    """
    self.url = '' if url is None else url

    rawDate = j[0]
    self.dtStr = '' if rawDate is None else rawDate
    self.timeStr = '00:00'

    rawTitle = j[1]
    self.title = ''
    if isinstance(rawTitle, list):
        self.title = downloader_common.relpaceHtmlEntities(rawTitle[0])
    elif isinstance(rawTitle, str):
        self.title = downloader_common.relpaceHtmlEntities(rawTitle)

    self.summary = ''

    # The author is resolved before the body so it can be removed from it.
    self.author = ''
    if len(j) > 3:
        rawAuthor = j[3]
        if rawAuthor is not None:
            self.author = rawAuthor.strip()

    self.body = list()
    rawBody = j[2]
    if rawBody is not None:
        # Only the first element of a list is used.
        if isinstance(rawBody, list):
            bodyText = rawBody[0]
        elif isinstance(rawBody, str):
            bodyText = rawBody
        else:
            bodyText = ''
        # Discard the print-version marker and everything after it.
        bodyText = bodyText.strip().partition('Друкована версія')[0]
        if len(self.author) > 0:
            bodyText = bodyText.replace(self.author, '')
        # Store every non-empty line with HTML entities replaced.
        for chunk in bodyText.split('\n'):
            converted = downloader_common.relpaceHtmlEntities(chunk.strip())
            if len(converted) > 0:
                self.body.append(converted)
def __init__(self, url, j):
    """Populate the instance from ``url`` and the positional field list ``j``.

    Args:
        url: Address of the item; ``None`` is stored as an empty string.
        j: Indexable collection read positionally: ``j[0]`` title, ``j[1]``
            summary, ``j[2]`` body text, ``j[3]`` author. All four positions
            are read unconditionally.

    ``dtStr`` is left empty and ``timeStr`` is set to ``'00:00'``.
    """
    self.url = url if url is not None else ''
    self.dtStr = ''
    self.timeStr = '00:00'

    self.title = ''
    rawTitle = j[0]
    if isinstance(rawTitle, str):
        self.title = downloader_common.relpaceHtmlEntities(rawTitle)
    elif isinstance(rawTitle, list) and rawTitle:
        self.title = downloader_common.relpaceHtmlEntities(rawTitle[0])
    if len(self.title) > 1000:
        # Over-long title: keep only its first line. split() leaves a title
        # without a line break intact, whereas slicing with find() == -1
        # would silently drop the last character.
        self.title = self.title.split('\n', 1)[0]

    self.summary = ''
    rawSummary = j[1]
    if rawSummary is not None:
        self.summary = rawSummary.strip()

    self.body = []
    rawBody = j[2]
    if rawBody is not None:
        if isinstance(rawBody, str):
            text = rawBody
        elif isinstance(rawBody, list):
            text = '\n'.join(rawBody)
        else:
            text = ''
        # Keep only non-empty lines, with HTML entities replaced.
        for line in text.strip().split('\n'):
            cleaned = downloader_common.relpaceHtmlEntities(line.strip())
            if cleaned:
                self.body.append(cleaned)

    # Only a string author is accepted; one trailing comma is removed.
    self.author = ''
    rawAuthor = j[3]
    if isinstance(rawAuthor, str):
        self.author = rawAuthor.strip()
        if self.author.endswith(','):
            self.author = self.author[:-1]
def __init__(self, url, j):
    """Fill the instance from ``url`` and the positional field list ``j``.

    All four slots are always read: ``j[0]`` date text, ``j[1]`` author,
    ``j[2]`` title, ``j[3]`` body text. ``summary`` is always left empty.
    """
    self.url = '' if url is None else url

    rawDate = j[0]
    self.dtStr = '' if rawDate is None else rawDate

    # Several authors are combined into one comma-separated string.
    rawAuthor = j[1]
    self.author = ''
    if isinstance(rawAuthor, list):
        self.author = ', '.join(rawAuthor)
    elif isinstance(rawAuthor, str):
        self.author = rawAuthor

    rawTitle = j[2]
    self.title = ''
    if isinstance(rawTitle, list):
        self.title = downloader_common.relpaceHtmlEntities(rawTitle[0])
    elif isinstance(rawTitle, str):
        self.title = downloader_common.relpaceHtmlEntities(rawTitle)

    self.summary = ''

    self.body = list()
    rawBody = j[3]
    if rawBody is not None:
        if isinstance(rawBody, list):
            bodyText = '\n'.join(rawBody)
        elif isinstance(rawBody, str):
            bodyText = rawBody
        else:
            bodyText = ''
        # Store every non-empty line with HTML entities replaced.
        converted = (
            downloader_common.relpaceHtmlEntities(chunk.strip())
            for chunk in bodyText.strip().split('\n')
        )
        self.body.extend(chunk for chunk in converted if len(chunk) > 0)
def storeUrlToFile(self, url):
    """Download ``url`` and save it locally, re-encoded as UTF-8.

    The page is decoded as windows-1251, every occurrence of the text
    ``windows-1251`` in it is replaced by ``utf-8``, and HTML entities are
    replaced. The conversion is done here because xidel cannot manage the
    original encoding correctly.

    Args:
        url: Address of the page to fetch.

    Returns:
        Path of the written file: ``/tmp/`` plus the URL with ``https://``
        removed and ``/`` replaced by ``_``, with an ``.html`` suffix.
    """
    # Close the HTTP response deterministically instead of leaking it.
    with urllib.request.urlopen(url) as response:
        rawBytes = response.read()

    content = rawBytes.decode('windows-1251').replace('windows-1251', 'utf-8')
    content = downloader_common.relpaceHtmlEntities(content)

    fileName = '/tmp/' + url.replace('https://', '').replace('/', '_') + '.html'
    # The content now declares utf-8, so write it as utf-8 explicitly rather
    # than relying on the platform's default encoding.
    with open(fileName, 'w', encoding='utf-8') as htmlFile:
        htmlFile.write(content)
    return fileName
def __init__(self, url, j):
    """Fill the instance from ``url`` and the positional field list ``j``.

    All three slots are always read: ``j[0]`` date/time (a string, or a list
    whose first two elements are date and time), ``j[1]`` title, ``j[2]``
    body text.
    """
    self.url = '' if url is None else url

    self.dtStr = ''
    self.timeStr = '00:00:00'
    rawDate = j[0]
    if isinstance(rawDate, str):
        self.dtStr = rawDate
    elif isinstance(rawDate, list):
        # Keep what follows the first comma of the date element (the whole
        # element when it has no comma), then append the time element.
        datePart = str(rawDate[0]).split(',', 1)[-1].strip()
        self.dtStr = datePart + ', ' + rawDate[1]
        self.timeStr = str(rawDate[1]).strip()

    rawTitle = j[1]
    self.title = ''
    if isinstance(rawTitle, list):
        self.title = downloader_common.relpaceHtmlEntities(rawTitle[0])
    elif isinstance(rawTitle, str):
        self.title = downloader_common.relpaceHtmlEntities(rawTitle)

    self.body = list()
    rawBody = j[2]
    if rawBody is not None:
        if isinstance(rawBody, list):
            bodyText = '\n'.join(rawBody)
        elif isinstance(rawBody, str):
            bodyText = rawBody
        else:
            bodyText = ''
        # Store non-empty lines, skipping any that contain the
        # "ЧИТАЙТЕ ТАКОЖ:" marker.
        for chunk in bodyText.strip().split('\n'):
            converted = downloader_common.relpaceHtmlEntities(chunk.strip())
            if len(converted) == 0 or 'ЧИТАЙТЕ ТАКОЖ:' in converted:
                continue
            self.body.append(converted)
def __init__(self, url, j):
    """Fill the instance from ``url`` and the positional field list ``j``.

    ``j[0]`` (date text), ``j[1]`` (title) and ``j[2]`` (body text) are
    always read. ``j[3]`` (summary), ``j[4]`` (author) and ``j[5]``
    (fallback author) are read only when ``j`` is long enough.
    """
    super().__init__()
    fieldCount = len(j)
    self.url = '' if url is None else url

    rawDate = j[0]
    self.dtStr = '' if rawDate is None else rawDate
    self.timeStr = '00:00'

    rawTitle = j[1]
    self.title = ''
    if isinstance(rawTitle, list):
        self.title = downloader_common.relpaceHtmlEntities(rawTitle[0])
    elif isinstance(rawTitle, str):
        self.title = downloader_common.relpaceHtmlEntities(rawTitle)

    self.body = list()
    rawBody = j[2]
    if rawBody is not None:
        if isinstance(rawBody, list):
            bodyText = '\n'.join(rawBody)
        elif isinstance(rawBody, str):
            bodyText = rawBody
        else:
            bodyText = ''
        # Drop HTML comments, including ones that span several lines.
        bodyText = re.sub("(<!--.*?-->)", "", bodyText.strip(),
                          flags=re.MULTILINE | re.DOTALL)
        for chunk in bodyText.split('\n'):
            converted = downloader_common.relpaceHtmlEntities(chunk.strip())
            # Skip empty lines and "Читайте також:" lines.
            if len(converted) == 0 or converted.startswith('Читайте також:'):
                continue
            # A line starting with the Economist agreement notice is replaced
            # by a fixed placeholder.
            if converted.startswith('Відповідно до угоди, статті the Economist'):
                converted = 'content deleted'
            self.body.append(converted)

    self.summary = ''
    if fieldCount > 3:
        rawSummary = j[3]
        if rawSummary is not None:
            self.summary = rawSummary.strip()

    self.author = ''
    if fieldCount > 4:
        primary = j[4]
        if primary is not None:
            if isinstance(primary, list):
                primary = '; '.join(primary)
            # A value containing the print-version marker is not an author.
            if 'Версія для друку' not in primary:
                self.author = primary.strip()
    # Use the sixth field only when no author has been found so far.
    if fieldCount > 5 and len(self.author) < 1:
        fallback = j[5]
        if fallback is not None:
            if isinstance(fallback, list):
                fallback = '; '.join(fallback)
            self.author = fallback.strip()
def __init__(self, url, j):
    """Fill the instance from ``url`` and the positional field list ``j``.

    ``j[0]`` (date/time text) is always read; ``j[1]`` (title), ``j[2]``
    (summary) and ``j[3]`` (body text) are read only when present.
    """
    fieldCount = len(j)
    self.url = '' if url is None else url

    rawDate = j[0]
    self.dtStr = '' if rawDate is None else rawDate
    # The time is taken from the last five characters of the date text.
    self.timeStr = self.dtStr[-5:] if len(self.dtStr) > 4 else '00:00'

    # Title and summary: list fragments are glued with single spaces.
    rawTitle = j[1] if fieldCount > 1 else None
    self.title = ''
    if isinstance(rawTitle, list):
        self.title = downloader_common.relpaceHtmlEntities(' '.join(rawTitle))
    elif isinstance(rawTitle, str):
        self.title = downloader_common.relpaceHtmlEntities(rawTitle)

    rawSummary = j[2] if fieldCount > 2 else None
    self.summary = ''
    if isinstance(rawSummary, list):
        self.summary = downloader_common.relpaceHtmlEntities(
            ' '.join(rawSummary)).strip()
    elif isinstance(rawSummary, str):
        self.summary = downloader_common.relpaceHtmlEntities(
            rawSummary).strip()

    self.body = list()
    rawBody = j[3] if fieldCount > 3 else None
    if rawBody is not None:
        bodyText = ''
        if isinstance(rawBody, str):
            bodyText = rawBody
        elif isinstance(rawBody, list):
            pieces = []
            for piece in rawBody:
                if 'За матеріалами:' in piece:
                    # Flatten the attribution onto one line, collapse runs of
                    # whitespace and tag it with the '@@@ ' prefix.
                    flattened = ' '.join(piece.replace('\n', '').split())
                    piece = '@@@ ' + flattened
                pieces.append('\n' + piece)
            bodyText = ''.join(pieces)

        # Remove the first copy of the summary from the body, if any.
        if len(self.summary) > 0 and len(bodyText) > 0:
            bodyText = bodyText.replace(self.summary, '', 1)

        # Store non-empty lines up to the first "Читайте також:" line.
        for chunk in bodyText.strip().split('\n'):
            trimmed = chunk.strip()
            if 'Читайте також:' in trimmed:
                break
            if len(trimmed) > 0:
                self.body.append(
                    downloader_common.relpaceHtmlEntities(trimmed))