import metapy


def getBadOfWords(sentence):
    metapy.log_to_stderr()
    doc = metapy.index.Document()
    # doc.content("I said that I can't believe that it only costs $19.95!")
    doc.content(sentence)

    tok = metapy.analyzers.ICUTokenizer(suppress_tags=True)
    tok.set_content(doc.content())  # this could be any string

    # Here, the LengthFilter consumes our original ICUTokenizer. It modifies
    # the token stream by only emitting tokens that are of a minimum length
    # of 2 and a maximum length of 30.
    tok = metapy.analyzers.LengthFilter(tok, min=2, max=30)
    tok.set_content(doc.content())  # this could be any string

    # Stopword removal (the ListFilter rejects any token found in the list)
    tok = metapy.analyzers.ListFilter(tok, "lemur-stopwords.txt",
                                      metapy.analyzers.ListFilter.Type.Reject)
    tok.set_content(doc.content())

    # Join the surviving tokens back into a single space-separated string
    tokens = [token for token in tok]
    cleanSentence = ""
    for word in tokens:
        cleanSentence += word
        cleanSentence += " "
    return cleanSentence
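# A minimal, illustrative call to getBadOfWords (not part of the original
# code). It assumes metapy is installed and that "lemur-stopwords.txt" is in
# the working directory; the exact tokens printed depend on that stopword list.
if __name__ == "__main__":
    print(getBadOfWords("I said that I can't believe that it only costs $19.95!"))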
def test_upload_submission(self):
    """
    This is the unit test that actually submits the results to the
    leaderboard. If there is an error (on either end of the submission),
    the unit test fails, and the failure string is also reproduced on the
    leaderboard.
    """
    # Relies on os, pytoml, requests, and metapy being imported at module
    # level, along with the Timeout context manager used below.
    metapy.log_to_stderr()
    req = {
        'token': os.environ.get('GITLAB_API_TOKEN'),
        'alias': os.environ.get('COMPETITION_ALIAS') or 'Anonymous',
        'results': []
    }
    for cfg_file in self.cfgs:
        res = {'error': None}
        with open(cfg_file, 'r') as fin:
            cfg_d = pytoml.load(fin)
        res['dataset'] = cfg_d['dataset']
        print("\nRunning on {}...".format(res['dataset']))
        timeout_len = cfg_d['timeout']
        try:
            # Abort a run that exceeds the dataset's configured time budget
            with Timeout(timeout_len):
                res['results'] = self.get_results(cfg_file)
        except Timeout.Timeout:
            error_msg = "Timeout error: {}s".format(timeout_len)
            res['error'] = error_msg
            res['results'] = []
        req['results'].append(res)

    response = requests.post(self.submission_url, json=req)
    jdata = response.json()
    print(jdata)
    self.assertTrue(jdata['submission_success'])
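# The Timeout context manager used above is not defined in this snippet. The
# sketch below is a hypothetical, signal-based implementation matching the
# interface the test expects (Timeout(seconds) as a context manager that
# raises Timeout.Timeout on expiry); it is an assumption, not the project's
# actual helper, and works only on Unix with integer second values.
import signal


class Timeout:
    class Timeout(Exception):
        """Raised when the wrapped block runs past the allotted seconds."""

    def __init__(self, seconds):
        self.seconds = seconds

    def _handle_alarm(self, signum, frame):
        raise Timeout.Timeout()

    def __enter__(self):
        signal.signal(signal.SIGALRM, self._handle_alarm)
        signal.alarm(self.seconds)
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        signal.alarm(0)  # cancel the pending alarm
        return False     # let any exception propagate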
import metapy

metapy.log_to_stderr()


class TextProcessor:
    keywords = {'About', 'WorkExperience', 'Education', 'Certificates',
                'Awards', 'Groups'}
    document = None
    tokenizer = None
    top_N = 5
    frequency_map = dict()
    sentence_token_map = dict()

    def __init__(self, top_n=5):
        self.document = metapy.index.Document()
        self.tokenizer = metapy.analyzers.ICUTokenizer(suppress_tags=True)
        self.tokenizer = metapy.analyzers.LengthFilter(self.tokenizer, min=2, max=30)
        self.tokenizer = metapy.analyzers.ListFilter(self.tokenizer,
                                                     "data/lemur-stopwords.txt",
                                                     metapy.analyzers.ListFilter.Type.Reject)
        self.top_N = top_n

    def split_sentences(self, text):
        sentences = []
        if text is not None and text != '':
            for line in text.split('\n'):
                if '. ' in line and 'ltd. ' not in line.lower():
                    pieces = line.split('. ')
                    for piece in pieces:
                        sentences.append(piece)
                else:
                    # Lines with no sentence break are kept whole
                    sentences.append(line)
        return sentences
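# A small, hypothetical usage sketch (not part of the original source). It
# assumes "data/lemur-stopwords.txt" exists relative to the working directory
# and that split_sentences returns the collected sentence list as written
# above; the printed tokens depend on that stopword list.
if __name__ == "__main__":
    processor = TextProcessor(top_n=5)
    resume_text = "Graduated in 2015. Joined the data team as an analyst.\nAwards"
    for sentence in processor.split_sentences(resume_text):
        processor.tokenizer.set_content(sentence)
        print([token for token in processor.tokenizer])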