def test_remove_url_with_pre_process_text():
    """remove_url=True strips URLs (even ones broken by newlines).

    pre_process_text also removes trailing/extra whitespace, so the
    expected text has no trailing space after the colon.
    """
    # URL with newline characters embedded inside it
    text = "text with url : https://g\nithub.com/ru\ncio/rucio"
    # same URL, intact
    text_2 = "text with url : https://github.com/rucio/rucio"
    correct_text = "text with url :"
    assert utils.pre_process_text(text, remove_url=True) == correct_text
    # BUG FIX: this assert previously duplicated the one above, leaving
    # text_2 unused — it was clearly meant to cover the intact-URL case.
    assert utils.pre_process_text(text_2, remove_url=True) == correct_text
def parse(
    self,
    issue_id,
    comment_id,
    creator,
    created_at,
    body,
    db=Database,
    issue_comments_table="issue_comments",
):
    """Parse one raw issue comment and persist it.

    :param [issue_id,...,body] : the raw issue comment's attributes
    :param db : <bot Database object> that stores the parsed comment
    :param issue_comments_table : alternative table name
                                  (default 'issue_comments')
    :returns issue_comment : IssueComment object
    """
    # GitHub API timestamps arrive in ISO 8601: "%Y-%m-%dT%H:%M:%SZ"
    created_at_utc = utils.convert_to_utc(created_at, "%Y-%m-%dT%H:%M:%SZ")
    cleaned_body = utils.pre_process_text(body, fix_url=True, remove_newline=True)

    issue_comment = IssueComment(
        issue_id=issue_id,
        comment_id=comment_id,
        creator=creator,
        created_at=created_at_utc,
        body=body,
        clean_body=cleaned_body,
    )
    db.insert_issue_comment(issue_comment, table_name=issue_comments_table)
    return issue_comment
def test_fix_urls_with_pre_process_text():
    """fix_url=True rejoins URLs that were split by newline characters."""
    broken = (
        "text with url : https://g\nithub.com/ru\ncio/rucio that has line newline char"
    )
    expected = (
        "text with url : https://github.com/rucio/rucio that has line newline char"
    )
    assert utils.pre_process_text(broken, fix_url=True) == expected
def test_lemmatizer_with_pre_process_text():
    """lemmatize=True maps each word to its dictionary lemma."""
    # nltk's word tokenizer might break tokens a bit weird eg. "->" to "- >"
    test_words = {
        "rocks": "rock",
        "corpora": "corpus",
        "developers": "developer",
    }
    # IDIOM FIX: iterate .items() instead of .keys() + a second dict lookup
    for word, lemma in test_words.items():
        assert utils.pre_process_text(word, lemmatize=True) == lemma
def parse(
    self,
    issue_id,
    title,
    state,
    creator,
    created_at,
    comments,
    body,
    db=Database,
    issues_table_name="issues",
):
    """Parse one raw issue and persist it if it has any comments.

    <!> Note : parse() is only expected to be used after an issues table
    exists in the db. Create it with the Database object's
    .create_issues_table() method before attempting to parse.

    :param [issue_id,...,body] : the raw issue's attributes
    :param db : <bot Database object> that stores the parsed issue
    :param issues_table_name : alternative table name (default 'issues')
    :returns issue : an <Issue object> created by the IssueParser
    """
    # GitHub API timestamps arrive in ISO 8601: "%Y-%m-%dT%H:%M:%SZ"
    created_at_utc = utils.convert_to_utc(created_at, "%Y-%m-%dT%H:%M:%SZ")
    cleaned_body = utils.pre_process_text(self.clean_issue_body(body), fix_url=True)

    issue = Issue(
        issue_id=issue_id,
        title=title,
        state=state,
        creator=creator,
        created_at=created_at_utc,
        comments=comments,
        body=body,
        clean_body=cleaned_body,
    )
    # no comments -> no context; only issues with discussion reach the db
    if issue.comments > 0:
        db.insert_issue(issue, table_name=issues_table_name)
    return issue
def clean_body(body):
    """
    Cleans the email's body.

    Applies the following:
    1) Remove newline characters from inside urls
    2) Replace newline characters with ' ' space
    3) Remove extra whitespaces
    4) Decontract words
    5) Try to find matches based on the regex patterns. If said matches
       exist, only keep the text up to the earliest match. These patterns
       appear inside emails right before text from previous emails is
       pasted/quoted.

    eg. Example of a reply email:
        "Dear Nick
         ...
         Thanks, George.
         On <DATE> Nick wrote:
         >> Previous email body
         >> Previous email body"
    from which we only keep:
        "Dear Nick
         ...
         Thanks, George."

    :param body               : body of an email
    :returns clean_email_body : cleaned body of an email, or None if an
                                unexpected error occurs while matching
    """
    # steps 1-4 done with utils.pre_process_text function
    clean_email_body = utils.pre_process_text(
        body, fix_url=True, remove_newline=True
    )
    try:
        # SIMPLIFICATION: the original tested each pattern with .search()
        # and then re-scanned with .finditer() + break to grab the first
        # match — but .search() already returns the first match, so one
        # call per pattern suffices.
        quote_patterns = (
            config.ON_HDR_REGEX,
            config.ORIGINAL_MSG_REGEX,
            config.QUOTED_REGEX,
            config.HEADER_REGEX,
        )
        starts = [
            match.start()
            for match in (pattern.search(clean_email_body) for pattern in quote_patterns)
            if match is not None
        ]
        # if any pattern matched, keep text up to the earliest/first match
        if starts:
            clean_email_body = clean_email_body[: min(starts)]
        return clean_email_body
    except Exception as _e:
        # NOTE(review): broad catch kept from the original best-effort
        # behavior — it logs and returns None; consider re-raising so
        # callers can distinguish failure from an empty body.
        print(_e)
        return None
def test_lower_with_pre_process_text():
    """lower_text=True lowercases the whole text."""
    sample = "A SAMPLE TEXT with upper case"
    result = utils.pre_process_text(sample, lower_text=True)
    assert result == sample.lower()
def test_pre_process_text_extra_whitespace():
    """Default pre-processing trims leading/trailing whitespace."""
    padded = " text wont contain extra spaces "
    trimmed = "text wont contain extra spaces"
    assert utils.pre_process_text(padded) == trimmed
def test_stemmer_with_pre_process_text():
    """stem=True reduces word variants to their common root."""
    # nltk's word tokenizer might break tokens a bit weird eg. "->" to "- >"
    expected_root = "program"
    variants = ["program", "programs", "programer", "programing", "programers"]
    for variant in variants:
        assert utils.pre_process_text(variant, stem=True) == expected_root
def test_remove_stopwords_with_pre_process_text():
    """remove_stop_words=True drops English stopwords from the text."""
    # nltk's word tokenizer might break tokens a bit weird eg. "->" to "- >"
    with_stopwords = "random stopwords -> our she when or too from how am re most while will"
    without_stopwords = "random stopwords - >"
    result = utils.pre_process_text(with_stopwords, remove_stop_words=True)
    assert result == without_stopwords
def test_replace_numbers_with_pre_process_text():
    """numbers_replacement substitutes each digit with the given char."""
    with_digits = "Here are all the numbers : 1234567890 ;)"
    replaced = "Here are all the numbers : hhhhhhhhhh ;)"
    result = utils.pre_process_text(
        with_digits, remove_numbers=True, numbers_replacement="h"
    )
    assert result == replaced
def test_remove_numbers_with_pre_process_text():
    """remove_numbers=True (with no replacement) deletes digits entirely."""
    with_digits = "Here are all the numbers : 1234567890 ;)"
    without_digits = "Here are all the numbers : ;)"
    assert utils.pre_process_text(with_digits, remove_numbers=True) == without_digits
def test_replace_punctuation_with_pre_process_text():
    """punctuation_replacement substitutes each punctuation char."""
    with_punct = """Here is all the punctuation : !"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"""
    replaced = "Here is all the punctuation h hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh"
    result = utils.pre_process_text(
        with_punct, remove_punctuation=True, punctuation_replacement="h"
    )
    assert result == replaced
def test_remove_punctuation_with_pre_process_text():
    """remove_punctuation=True deletes punctuation chars entirely."""
    # remember, pre_process_text removes extra whitespaces
    with_punct = """Here is all the punctuation : !"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"""
    without_punct = """Here is all the punctuation"""
    assert utils.pre_process_text(with_punct, remove_punctuation=True) == without_punct
def test_decontract_with_pre_process_text():
    """decontract_words=True expands common English contractions."""
    contracted = "won't, can't, shouldn't, we're, that's, I'd, we'll, aren't, they've, I'm"
    expanded = "will not, can not, should not, we are, that is, I would, we will, are not, they have, I am"
    assert utils.pre_process_text(contracted, decontract_words=True) == expanded
def test_remove_newline_with_pre_process_text():
    """remove_newline replaces newline chars with spaces.

    It is not supposed to concat words that were broken by a newline char.
    """
    multiline = "text\n with multi\nple newline\n chars"
    flattened = "text with multi ple newline chars"
    assert utils.pre_process_text(multiline, remove_newline=True) == flattened