import nltk


def process_resume(email_data):
    # Download the resume attachment and extract its raw text.
    path = download_attachment(email_data["attachment_url"])
    text = convert_pdf_to_txt(path)
    text = text.rstrip()

    # Persist the extracted text so the downstream helpers can re-read it.
    # (Python 3: open() in text mode replaces the old file(..., "wb") call.)
    with open("temp.txt", "w", encoding="utf-8") as fp:
        fp.write(text)

    resume_text_list = getresumecontent("temp.txt")
    resume_text = ';'.join(resume_text_list)
    # resume_text is already a unicode str in Python 3.
    clean_text = Stopwords.removeStopWords(resume_text)
    keywords = extract_keywords(resume_text)

    # Pull the work-experience section and run NER over it.
    workex = workexfinder("temp.txt")
    lines = workex.split(";;")
    entity_names = []
    string = "".join(lines)
    tokens = nltk.word_tokenize(string)
    tagged = nltk.pos_tag(tokens)
    entities = nltk.chunk.ne_chunk(tagged)
    for entity in entities:
        entity_names.extend(extract_entity_names(entity))

    retval = {}
    retval["keywords"] = keywords
    retval["workex"] = entity_names
    return retval
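A minimal usage sketch, assuming the helper functions (download_attachment, convert_pdf_to_txt, getresumecontent, workexfinder, extract_keywords, extract_entity_names) are defined elsewhere in the project; the email_data payload shape and URL below are hypothetical.

# Hypothetical call: the email_data shape is an assumption, not from the source.
email_data = {"attachment_url": "https://example.com/resume.pdf"}
result = process_resume(email_data)
print(result["keywords"])  # extracted keywords
print(result["workex"])    # named entities found in the work-experience section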
def _preprocessCaption(self, cap):
    def removeAt(cap):
        # Strip @mentions (e.g. @eddie): skip characters from an '@' up to
        # the next whitespace or '#'.
        end_at = [' ', '\t', '#']
        new_cap = ''
        pre_is_at = False
        for c in cap:
            if c == '@':
                pre_is_at = True
                continue
            if pre_is_at and c in end_at:
                pre_is_at = False
            if not pre_is_at:
                new_cap += c
        return new_cap

    cap = removeAt(cap)

    # Split camelCase runs: insert a space before each new uppercase run,
    # lowercase letters, and turn non-letters into spaces.
    new_cap = ''
    pre_is_cap = False
    for c in cap:
        if c.isupper():
            if not pre_is_cap:
                new_cap += ' '
            new_cap += c.lower()
            pre_is_cap = True
            continue
        if c.islower():
            new_cap += c
        else:
            new_cap += ' '
        pre_is_cap = False

    # Count word frequencies, optionally dropping stopwords and short words.
    words = new_cap.split()
    stopword_list = Stopwords.stopwords()
    tmp_dict = {}
    for word in words:
        word = word.strip()
        if self._stopword_removal and word in stopword_list:
            continue
        if len(word) < 3:
            continue
        tmp_dict[word] = tmp_dict.get(word, 0) + 1
    return tmp_dict
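For illustration, the transformation the method applies to a sample caption (the input string is hypothetical; assumes neither resulting word is in the stopword list):

# Hypothetical trace of _preprocessCaption:
#   "@eddie LovesNYC"
#   -> removeAt strips the mention:       " LovesNYC"
#   -> camelCase split + lowercasing:     " loves nyc"
#   -> frequency counting (len >= 3):     {'loves': 1, 'nyc': 1}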
def _preprocessText(self, cap):
    # Normalize the text, then count word frequencies, optionally dropping
    # stopwords and words shorter than three characters.
    new_cap = tool.textPreprocessor(cap)
    words = new_cap.split()
    stopword_list = Stopwords.stopwords()
    tmp_dict = {}
    for word in words:
        word = word.strip()
        if self._stopword_removal and word in stopword_list:
            continue
        if len(word) < 3:
            continue
        tmp_dict[word] = tmp_dict.get(word, 0) + 1
    return tmp_dict
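Both preprocessors end in the same counting loop; a standalone sketch of that core pattern, with the stopword set and length threshold as illustrative parameters (the function name is hypothetical):

def count_words(text, stopwords=frozenset(), min_len=3):
    # Count word frequencies, skipping stopwords and short words,
    # mirroring the tmp_dict loop above.
    counts = {}
    for word in text.split():
        word = word.strip()
        if word in stopwords or len(word) < min_len:
            continue
        counts[word] = counts.get(word, 0) + 1
    return counts

# e.g. count_words("the cat sat on the mat", stopwords={"the"})
# -> {'cat': 1, 'sat': 1, 'mat': 1}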
import re

from bs4 import BeautifulSoup

re_matcher = re.compile("^https?://.*ics.uci.edu")


def get_links(html):
    # Collect hrefs of anchors whose URL matches the ics.uci.edu pattern.
    links = []
    soup = BeautifulSoup(html, "html.parser")
    for link in soup.findAll('a', attrs={'href': re_matcher}):
        links.append(link.get('href'))
    return links


def hasdigit(token):
    return any(c.isdigit() for c in token)


stopw = Stopwords()


def check_token(token):
    # Keep tokens that are not stopwords, contain no digits, and are
    # between 2 and 19 characters long.
    return (not stopw.is_stop(token) and not hasdigit(token)
            and len(token) > 1 and len(token) < 20)


def add_token(token):
    pass


nonalphanum = re.compile("[^0-9a-z']")


def tokenize_text(intext):
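    # The original body is truncated here; the following is an assumed
    # completion: lowercase the input, split on the nonalphanum pattern,
    # and collect tokens that pass check_token (add_token is a stub above).
    tokens = []
    for token in nonalphanum.split(intext.lower()):
        if token and check_token(token):
            add_token(token)
            tokens.append(token)
    return tokens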
import atexit
import os

from zope.component import provideUtility
from zope.component.interfaces import IFactory
from zope.component.testing import setUp

from index import Index
from parsers.english import EnglishParser
from splitter import SplitterFactory
from stopwords import Stopwords
from zopyx.txng3.core.interfaces import IParser, IStopwords, IThesaurus
from zopyx.txng3.core.lexicon import LexiconFactory
from zopyx.txng3.core.storage import StorageWithTermFrequencyFactory
from zopyx.txng3.core.thesaurus import GermanThesaurus

# Set up the component registry and register the indexing utilities.
setUp()
provideUtility(SplitterFactory, IFactory, 'txng.splitters.default')
provideUtility(EnglishParser(), IParser, 'txng.parsers.en')
provideUtility(Stopwords(), IStopwords, 'txng.stopwords')
provideUtility(LexiconFactory, IFactory, 'txng.lexicons.default')
provideUtility(StorageWithTermFrequencyFactory, IFactory,
               'txng.storages.default')
provideUtility(GermanThesaurus, IThesaurus, 'txng.thesaurus.de')

# Load and persist interactive readline history if available; readline may
# be missing (ImportError) or the history file unreadable (OSError).
try:
    import readline
    histfile = os.path.expanduser('~/.pyhist')
    readline.read_history_file(histfile)
    atexit.register(readline.write_history_file, histfile)
except (ImportError, OSError):
    pass


class Text: