def test_parse_orgmode_list(self):
    """parse_orgmode_list: comment lines (``#…``) are skipped; an item with
    a time component is not all-day; both active ``<…>`` and inactive
    ``[…]`` date formats are parsed."""
    org = orgparse.loads('''#+STARTUP: showall
- Lorem ipsum foo. <2019-01-17 Thu>
- bar <2019-01-18 Fri 11:30>
- spam [2021-05-13 Thu]
''')
    subprovider = 'my_provider'
    result = list(parse_orgmode_list(org, subprovider))
    expected = [
        # Date only -> all-day item
        Item.normalized(
            datetime_=datetime.datetime(2019, 1, 17),
            text='Lorem ipsum foo.',
            provider='orgmodelist',
            subprovider=subprovider,
            all_day=True,
        ),
        # Time present -> not all-day
        Item.normalized(
            datetime_=datetime.datetime(2019, 1, 18, 11, 30),
            text='bar',
            provider='orgmodelist',
            subprovider=subprovider,
            all_day=False,
        ),
        # Inactive ([...]) timestamp is accepted as well
        Item.normalized(
            datetime_=datetime.datetime(2021, 5, 13),
            text='spam',
            provider='orgmodelist',
            subprovider=subprovider,
            all_day=True,
        ),
    ]
    self.assertListEqual(result, expected)
def parse_orgmode(f: IO, subprovider: str) -> Iterator[Item]:
    """Parse an Org-mode file into all-day timeline items.

    Heading lines matched by ``regex_heading`` set the current date
    (parsed as ``%Y-%m-%d %a``). Subsequent non-empty lines are collected
    into a paragraph, which is emitted as one item when a blank line, a
    new heading, or the end of file is reached. Headings whose ``todo``
    group matches reset the current date to ``None`` so their content is
    ignored; content before the first heading is ignored as well.

    :param f: readable text file object with Org-mode content
    :param subprovider: subprovider identifier attached to every item
    """
    current_datetime: Optional[datetime.datetime] = None
    current_paragraph: List[str] = []
    # peekable so that `not lines` can detect the end of file while the
    # last line is still being processed.
    lines = peekable(f)
    for line in lines:
        line_clean = line.strip()
        if line_clean:
            m = regex_heading.match(line_clean)
            # Title line
            if m:
                # Fix: flush the pending paragraph BEFORE reassigning the
                # date, so a paragraph directly followed by the next
                # heading (no blank line) keeps its own heading's date.
                if current_datetime and current_paragraph:
                    yield Item.normalized(
                        datetime_=current_datetime,
                        text='\n'.join(current_paragraph),
                        provider=provider,
                        subprovider=subprovider,
                        all_day=True,
                    )
                    current_paragraph.clear()
                if m.group('todo'):
                    current_datetime = None
                else:
                    current_datetime = datetime.datetime.strptime(
                        m.group('date'), '%Y-%m-%d %a')
            # Paragraph line but not before first heading
            elif current_datetime:
                current_paragraph.append(line_clean)
        # Empty line after paragraph or last line of file
        if not line_clean or not lines:
            if current_datetime and current_paragraph:
                yield Item.normalized(
                    datetime_=current_datetime,
                    text='\n'.join(current_paragraph),
                    provider=provider,
                    subprovider=subprovider,
                    all_day=True,
                )
                current_paragraph.clear()
def _parse_events(events_data: Iterable[str], subprovider: str) -> Iterator[Item]:
    """Parse each raw calendar document into normalized timeline items.

    :param events_data: iterable of raw calendar texts, one document each
    :param subprovider: subprovider identifier attached to every item
    """
    for raw_document in events_data:
        buffer = io.StringIO(raw_document)
        for cal_event in parse_calendar(buffer):
            yield Item.normalized(
                datetime_=cal_event.begin,
                text=cal_event.name,
                provider=provider,
                subprovider=subprovider,
                all_day=cal_event.all_day,
            )
def main(config: dict, *args, **kwargs) -> Iterator[Item]:
    """Yield timeline items from a Facebook archive.

    Reads the HTML archive at ``config['path']`` and emits one item per
    timeline status, attributed to ``config['username']``.
    """
    archive_path = config['path']
    username = config['username']
    logger.info('Reading Facebook archive %s', archive_path)
    page_soup = _read_html(archive_path)
    for status in _parse_timeline_page(page_soup):
        yield Item.normalized(
            datetime_=status.datetime_,
            text=status.text,
            provider=provider,
            subprovider=username,
        )
def _parse_ratings_pages(
    soups: Iterable[BeautifulSoup], subprovider: str
) -> Iterator[Item]:
    """Turn parsed film-ratings pages into all-day timeline items."""
    for page in soups:
        for rated_film in _parse_ratings_page(page):
            yield Item.normalized(
                datetime_=rated_film.datetime_,
                text=rated_film.title,
                provider=provider,
                subprovider=subprovider,
                all_day=True,
            )
def _parse_tweets_file(path: Path) -> Iterator[Item]:
    """Yield one timeline item per tweet stored in the archive file at *path*."""
    with path.open() as f:
        f.readline()  # Skip the first line, which is not JSON
        tweets_data = json.load(f)
        for tweet_data in tweets_data:
            timestamp = datetime.datetime.strptime(
                tweet_data['created_at'], '%Y-%m-%d %H:%M:%S %z')
            yield Item.normalized(
                datetime_=timestamp,
                text=tweet_data['text'],
                provider=provider,
                subprovider=tweet_data['user']['screen_name'],
            )
def main(config: dict, *args, **kwargs) -> Iterator[Item]:
    """Yield calendar events from the configured calendar files.

    Events that compare equal to an already emitted event are skipped, so
    the same event appearing in several calendars is reported only once.
    """
    seen_events: List[Event] = []
    for path_str in config['paths']:
        path = Path(path_str)
        subprovider = path.name  # the calendar file name identifies the source
        for event in _read_calendar(path):
            if event in seen_events:
                continue
            yield Item.normalized(
                datetime_=event.begin,
                text=event.name,
                provider=provider,
                subprovider=subprovider,
                all_day=event.all_day,
            )
            seen_events.append(event)
def _read_git_logs(repo_paths: Iterable[str], author: str) -> Iterator[Item]:
    """Yield one timeline item per commit by *author* in each repository.

    Each log line produced by ``_call_git_log`` is expected to be
    ``<ISO datetime>,<commit message>``; the message may itself contain
    commas (``maxsplit=1``).

    :param repo_paths: file system paths of git repositories
    :param author: author filter passed to ``git log``
    """
    for repo_path in repo_paths:
        logger.info('Reading repository %s', repo_path)
        repo_name = os.path.basename(repo_path)
        try:
            log = _call_git_log(repo_path, author)
        except subprocess.CalledProcessError:
            # Best effort: one broken or missing repository should not stop
            # the remaining ones, but don't skip it silently either.
            logger.warning('Failed to read git log of repository %s', repo_path)
            continue
        for log_line in log.splitlines():
            formatted_datetime_, text = log_line.split(',', maxsplit=1)
            datetime_ = datetime.datetime.fromisoformat(formatted_datetime_)
            yield Item.normalized(
                datetime_=datetime_,
                text=text,
                provider=provider,
                subprovider=repo_name,
            )
def main(config: dict, *args, **kwargs) -> Iterator[Item]:
    """Yield all-day items parsed from the configured todo.txt file.

    Lines that don't match ``regex_line`` are silently ignored.
    """
    path = Path(config['path'])
    subprovider = path.name
    logger.info('Reading todo.txt file %s', path)
    with path.open() as f:
        for line in f:
            match = regex_line.match(line)
            if match is None:
                continue
            year, month, day = (
                int(match.group(part)) for part in ('y', 'm', 'd')
            )
            yield Item.normalized(
                datetime_=datetime.datetime(year, month, day),
                text=_clean_text(match.group('text')),
                provider=provider,
                subprovider=subprovider,
                all_day=True,
            )
def main(config: dict, *args, **kwargs) -> Iterator[Item]:
    """Yield all-day items built from rows of the configured CSV file.

    ``config['date_source']`` and ``config['text_source']`` are mustache
    templates rendered against each CSV row; the rendered date string is
    parsed with ``config['date_format']``.
    """
    path = Path(config['path'])
    subprovider = path.name
    logger.info('Reading CSV file %s', path)
    # Identity escape: keep rendered values verbatim (no HTML escaping).
    renderer = pystache.Renderer(escape=lambda u: u)
    date_tmpl = pystache.parse(config['date_source'])
    text_tmpl = pystache.parse(config['text_source'])
    with path.open() as f:
        for row in csv.DictReader(f):
            rendered_date = renderer.render(date_tmpl, row)
            yield Item.normalized(
                datetime_=datetime.datetime.strptime(
                    rendered_date, config['date_format']),
                text=renderer.render(text_tmpl, row),
                provider=provider,
                subprovider=subprovider,
                all_day=True,
            )
def _read_messages(pathname: str, sent: bool) -> Iterator[Item]:
    """Yield one timeline item per email message matching the glob *pathname*.

    Messages without a ``Date`` header are skipped with a warning.

    :param sent: whether the messages are sent (as opposed to received);
        forwarded to the text formatter
    """
    for path in glob.glob(pathname):
        logger.info('Reading message %s', path)
        with open(path, 'rb') as f:
            message = email.message_from_binary_file(f)
            if not message['Date']:
                logger.warning('Skipping message without date: %s', path)
                continue
            timestamp = _parse_date(message['Date'])
            text = _format_text(
                _parse_address(message['From']),
                _parse_address(message['To']),
                _decode_header(message['Subject']),
                sent,
            )
            # NOTE(review): the glob pattern (not the individual message
            # path) is used as the subprovider — confirm this is intended.
            yield Item.normalized(
                datetime_=timestamp,
                text=text,
                provider=provider,
                subprovider=pathname,
            )
def parse_orgmode_list(org: orgparse.OrgNode, subprovider: str) -> Iterator[Item]:
    """Parse a flat Org-mode list of dated entries into timeline items.

    Each non-empty body line (lines starting with ``#`` are skipped) must
    match ``regex_item``, providing ``text`` and ``date`` groups. Entries
    whose date string cannot be parsed are skipped with a warning. An
    entry is all-day when its parsed datetime has no time component.

    :raises OrgModeError: if a line does not match the expected format
    """
    for line in org.root.body.splitlines():
        if not line or line.startswith('#'):
            continue
        m = regex_item.search(line)
        if not m:
            # NOTE(review): "Unknow" [sic] kept byte-identical — callers or
            # tests may match this message text.
            raise OrgModeError(f'Unknow format of line "{line}"')
        text = m.group('text')
        date_str = m.group('date')
        datetime_ = dateparser.parse(date_str)
        if not datetime_:
            # Fix: logger.warn is a deprecated alias of logger.warning.
            logger.warning('Failed to parse date "%s"', date_str)
            continue
        all_day = not any((datetime_.hour, datetime_.minute, datetime_.second))
        yield Item.normalized(
            datetime_=datetime_,
            text=text,
            provider=provider,
            subprovider=subprovider,
            all_day=all_day,
        )
def test_parse_txt(self):
    """parse_txt: nested indented lines are joined with ': ' into one item;
    returning to the same or a shallower indent emits the accumulated path;
    lines deeper than max_indent are glued to the previous entry with a
    space; a heading with no content emits nothing."""
    f = io.StringIO('''2019-01-17 Čt
    foo
    bar
        baz baz

2019-01-18 Pá

2019-01-19
    one
        two
            foo
            three
                four
                bar
            baz
        spam
    lorem
''')
    subprovider = 'my_provider'
    result = list(parse_txt(f, subprovider))
    expected = [
        Item.normalized(
            datetime_=datetime.datetime(2019, 1, 17),
            text='foo',
            provider='txt',
            subprovider=subprovider,
            all_day=True,
        ),
        Item.normalized(
            datetime_=datetime.datetime(2019, 1, 17),
            text='bar: baz baz',
            provider='txt',
            subprovider=subprovider,
            all_day=True,
        ),
        Item.normalized(
            datetime_=datetime.datetime(2019, 1, 19),
            text='one: two: foo',
            provider='txt',
            subprovider=subprovider,
            all_day=True,
        ),
        # 'four' and 'bar' exceed max_indent and are appended to 'three'
        Item.normalized(
            datetime_=datetime.datetime(2019, 1, 19),
            text='one: two: three four bar',
            provider='txt',
            subprovider=subprovider,
            all_day=True,
        ),
        Item.normalized(
            datetime_=datetime.datetime(2019, 1, 19),
            text='one: two: baz',
            provider='txt',
            subprovider=subprovider,
            all_day=True,
        ),
        Item.normalized(
            datetime_=datetime.datetime(2019, 1, 19),
            text='one: spam',
            provider='txt',
            subprovider=subprovider,
            all_day=True,
        ),
        # Last entry is flushed at end of file
        Item.normalized(
            datetime_=datetime.datetime(2019, 1, 19),
            text='lorem',
            provider='txt',
            subprovider=subprovider,
            all_day=True,
        ),
    ]
    self.assertListEqual(result, expected)
def _read_items(csv_path: str) -> Iterator[Item]:
    """Yield an ``Item`` for every row of the CSV file at *csv_path*."""
    with open(csv_path) as f:
        yield from map(Item.from_tuple, csv.reader(f))
def test_parse_orgmode(self):
    """parse_orgmode: paragraphs split on blank lines; multiple blank lines
    are tolerated; a heading directly after a paragraph still ends it; TODO
    headings and their content are ignored."""
    f = io.StringIO('''#+STARTUP: showall

* <2019-01-17 Thu>
foo
bar

two empty lines are okay


* <2019-01-18 Fri>
missing empty line is okay
* <2019-01-19 Sat>
something

something else
- with
- a
- list

* TODO <2019-01-20 Sun>
ignore this
''')
    subprovider = 'my_provider'
    result = list(parse_orgmode(f, subprovider))
    expected = [
        Item.normalized(
            datetime_=datetime.datetime(2019, 1, 17),
            text='foo\nbar',
            provider='orgmode',
            subprovider=subprovider,
            all_day=True,
        ),
        Item.normalized(
            datetime_=datetime.datetime(2019, 1, 17),
            text='two empty lines are okay',
            provider='orgmode',
            subprovider=subprovider,
            all_day=True,
        ),
        # Paragraph followed directly by the next heading keeps its date
        Item.normalized(
            datetime_=datetime.datetime(2019, 1, 18),
            text='missing empty line is okay',
            provider='orgmode',
            subprovider=subprovider,
            all_day=True,
        ),
        Item.normalized(
            datetime_=datetime.datetime(2019, 1, 19),
            text='something',
            provider='orgmode',
            subprovider=subprovider,
            all_day=True,
        ),
        # List markers are kept verbatim inside the paragraph text
        Item.normalized(
            datetime_=datetime.datetime(2019, 1, 19),
            text='something else\n- with\n- a\n- list',
            provider='orgmode',
            subprovider=subprovider,
            all_day=True,
        ),
    ]
    self.assertListEqual(result, expected)
def parse_txt(
    f: IO,
    subprovider: str,
    indent_spaces: int = 4,
    sep: str = ': ',
    max_indent: int = 3,
    sep_after_max_indent: str = ' ',
) -> Iterator[Item]:
    """Parse an indented plain-text journal into all-day timeline items.

    The file consists of date headings (matched by ``regex_heading``,
    date parsed as ``%Y-%m-%d``) followed by content lines indented with
    multiples of *indent_spaces* spaces. Nested content lines form a path
    that is joined with *sep* into the item text; an item is emitted
    whenever the indentation returns to the same level or shallower, and
    once more at end of file. Lines indented deeper than *max_indent*
    levels are appended to the previous entry with *sep_after_max_indent*
    instead of nesting further.

    :raises ValueError: on content before the first date heading, a
        malformed line, or indentation that is not a multiple of
        *indent_spaces*
    """
    current_datetime: Optional[datetime.datetime] = None
    stack: List[str] = []
    lines = peekable(f)
    for line in lines:
        line_clean = line.rstrip()
        if not line_clean:
            continue
        # Title line
        m = regex_heading.match(line_clean)
        if m:
            # Flush the entry pending from the previous date.
            if stack:
                if not current_datetime:
                    raise ValueError('No date found')
                text = sep.join(stack)
                yield Item.normalized(
                    datetime_=current_datetime,
                    text=text,
                    provider=provider,
                    subprovider=subprovider,
                    all_day=True,
                )
                stack.clear()
            date_str = m.group('date')
            current_datetime = datetime.datetime.strptime(date_str, '%Y-%m-%d')
        # Starts with a non-date line
        elif not current_datetime:
            raise ValueError('No date found')
        # Content line
        else:
            m = regex_content.match(line_clean)
            if not m:
                raise ValueError(f'Misformatted line "{line_clean}"')
            indent_len = len(m.group('indent'))
            if indent_len % indent_spaces != 0:
                raise ValueError(
                    f'Indent not a multiple of {indent_spaces} '
                    f'"{line_clean}"'
                )
            # Fix: use integer division — divisibility was checked above,
            # so keep the indent level an int (was a float with `/`).
            indent_size = indent_len // indent_spaces
            raw_text = m.group('text')
            if indent_size > max_indent:
                # Too deep to nest: glue onto the innermost stack entry.
                indent_size = max_indent
                stack[-1] = sep_after_max_indent.join([stack[-1], raw_text])
                continue
            if indent_size <= len(stack):
                # Indentation returned to this level or shallower: the
                # current path is complete, emit it.
                text = sep.join(stack)
                yield Item.normalized(
                    datetime_=current_datetime,
                    text=text,
                    provider=provider,
                    subprovider=subprovider,
                    all_day=True,
                )
                # Pop one extra level when the new line is shallower.
                if indent_size < len(stack):
                    stack.pop()
                stack.pop()
            stack.append(raw_text)
    # End of file: flush the entry still on the stack. Doing this after
    # the loop (rather than on the last iteration) also covers files that
    # end with blank lines.
    if stack:
        text = sep.join(stack)
        yield Item.normalized(
            datetime_=current_datetime,
            text=text,
            provider=provider,
            subprovider=subprovider,
            all_day=True,
        )