Example #1
import nltk

def process_resume(email_data):
    # Download the resume attachment and extract its text.
    path = download_attachment(email_data["attachment_url"])
    text = convert_pdf_to_txt(path)
    text = text.rstrip()

    # Persist the extracted text so the helper functions can re-read it.
    with open("temp.txt", "w", encoding="utf-8") as fp:
        fp.write(text)

    resume_text_list = getresumecontent("temp.txt")
    resume_text = ';'.join(resume_text_list)
    clean_text = Stopwords.removeStopWords(resume_text)
    keywords = extract_keywords(resume_text)

    # Run NLTK named-entity recognition over the work-experience section.
    workex = workexfinder("temp.txt")
    lines = workex.split(";;")
    entity_names = []
    string = "".join(lines)
    tokens = nltk.word_tokenize(string)
    tagged = nltk.pos_tag(tokens)
    entities = nltk.chunk.ne_chunk(tagged)
    for entity in entities:
        entity_names.extend(extract_entity_names(entity))

    return {"keywords": keywords, "workex": entity_names}
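Example #1 assumes an extract_entity_names helper that never appears on this page. A minimal sketch of such a helper for the tree that nltk.chunk.ne_chunk returns (an assumption about its behaviour, not the project's actual code):

import nltk

def extract_entity_names(tree):
    # Collect the words of every chunked (named-entity) subtree; plain
    # (word, tag) leaves contribute nothing on their own.
    names = []
    if isinstance(tree, nltk.Tree):
        if tree.label() != 'S':
            names.append(' '.join(word for word, tag in tree.leaves()))
        for child in tree:
            names.extend(extract_entity_names(child))
    return names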
Example #2
    def _preprocessCaption(self, cap):
        def removeAt(cap):
            # Strip @mentions (e.g. "@eddie") up to the next space, tab or '#'.
            end_at = [' ', '\t', '#']
            new_cap = ''
            in_mention = False
            for c in cap:
                if c == '@':
                    in_mention = True
                    continue
                if in_mention and c in end_at:
                    in_mention = False
                if not in_mention:
                    new_cap += c
            return new_cap

        cap = removeAt(cap)

        # Split camel-cased tokens on capital letters and lower-case everything;
        # any character that is neither upper- nor lower-case becomes a separator.
        new_cap = ''
        pre_is_cap = False
        for c in cap:
            if c.isupper():
                if not pre_is_cap:
                    new_cap += ' '
                new_cap += c.lower()
                pre_is_cap = True
                continue

            if c.islower():
                new_cap += c
            else:
                new_cap += ' '
            pre_is_cap = False

        # Count word frequencies, optionally dropping stopwords and always
        # dropping words shorter than three characters.
        words = new_cap.split()
        stopword_list = Stopwords.stopwords()
        tmp_dict = {}

        for word in words:
            word = word.strip()
            if self._stopword_removal and word in stopword_list:
                continue
            if len(word) < 3:
                continue
            tmp_dict[word] = tmp_dict.get(word, 0) + 1
        return tmp_dict
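As a rough, hypothetical illustration (the caption, the proc instance, and the assumption that stopword removal is disabled are all made up for this example):

proc._preprocessCaption("@eddie GoodMorningNYC sunrise run!")
# -> {'good': 1, 'morning': 1, 'nyc': 1, 'sunrise': 1, 'run': 1}

The "@eddie" mention is stripped by removeAt, "GoodMorningNYC" is split on its capital letters and lower-cased, the "!" becomes a separator, and no remaining token falls under the three-character minimum.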
Example #4
    def _preprocessText(self, cap):
        # Normalise the raw text, then count word frequencies, optionally
        # dropping stopwords and always dropping words shorter than three
        # characters.
        new_cap = tool.textPreprocessor(cap)

        words = new_cap.split()
        stopword_list = Stopwords.stopwords()
        tmp_dict = {}

        for word in words:
            word = word.strip()
            if self._stopword_removal and word in stopword_list:
                continue
            if len(word) < 3:
                continue
            tmp_dict[word] = tmp_dict.get(word, 0) + 1
        return tmp_dict
Example #5

import re

from bs4 import BeautifulSoup

re_matcher = re.compile("^https?://.*ics.uci.edu")


def get_links(html):
    links = []
    soup = BeautifulSoup(html, "html.parser")
    for link in soup.findAll('a', attrs={'href': re_matcher}):
        links.append(link.get('href'))
    return links
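For instance, against a small made-up HTML fragment, only the link matching the ics.uci.edu pattern comes back:

sample_html = ('<a href="https://www.ics.uci.edu/about">About</a>'
               '<a href="https://example.com">Other</a>')
print(get_links(sample_html))  # ['https://www.ics.uci.edu/about']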


def hasdigit(token):
    return any(c.isdigit() for c in token)


stopw = Stopwords()


def check_token(token):
    return not stopw.is_stop(token) and not hasdigit(
        token) and len(token) > 1 and len(token) < 20


def add_token(token):
    pass


nonalphanum = re.compile("[^0-9a-z']")


def tokenize_text(intext):
    # Body reconstructed as a minimal sketch (the original snippet is cut off
    # here): lower-case the text, split on non-alphanumeric characters, keep
    # the tokens that pass check_token, and register each one via add_token.
    tokens = []
    for token in nonalphanum.split(intext.lower()):
        if token and check_token(token):
            add_token(token)
            tokens.append(token)
    return tokens
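None of the examples above include the Stopwords helper itself. A minimal sketch of the interface they appear to rely on, with method names inferred from the call sites; this is an assumption, not the project's real implementation:

class Stopwords:
    # Illustrative subset only; a real stopword list would be much longer.
    _words = {'a', 'an', 'and', 'the', 'of', 'to', 'in', 'for', 'is'}

    @classmethod
    def stopwords(cls):
        # Used in Examples #2 and #4 for membership tests against each word.
        return cls._words

    @classmethod
    def removeStopWords(cls, text):
        # Used in Example #1 to strip stopwords from a whitespace-joined string.
        return ' '.join(w for w in text.split() if w.lower() not in cls._words)

    def is_stop(self, token):
        # Used in Example #5 on an instance: stopw = Stopwords(); stopw.is_stop(tok)
        return token.lower() in self._words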
Example #6
import atexit
import os

from zope.component import provideUtility
from zope.component.interfaces import IFactory
from zope.component.testing import setUp

from index import Index
from parsers.english import EnglishParser
from splitter import SplitterFactory
from stopwords import Stopwords
from zopyx.txng3.core.interfaces import IParser, IStopwords, IThesaurus
from zopyx.txng3.core.lexicon import LexiconFactory
from zopyx.txng3.core.storage import StorageWithTermFrequencyFactory
from zopyx.txng3.core.thesaurus import GermanThesaurus

# Setup environment
setUp()
provideUtility(SplitterFactory, IFactory, 'txng.splitters.default')
provideUtility(EnglishParser(), IParser, 'txng.parsers.en')
provideUtility(Stopwords(), IStopwords, 'txng.stopwords')
provideUtility(LexiconFactory, IFactory, 'txng.lexicons.default')
provideUtility(StorageWithTermFrequencyFactory, IFactory,
               'txng.storages.default')
provideUtility(GermanThesaurus, IThesaurus, 'txng.thesaurus.de')
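Once registered, these utilities can be looked up anywhere in the process by interface and name via zope.component; a short illustrative lookup (not part of the original script):

from zope.component import getUtility

parser = getUtility(IParser, 'txng.parsers.en')       # the EnglishParser instance
stopwords = getUtility(IStopwords, 'txng.stopwords')  # the Stopwords utility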

try:
    # Persist the interactive interpreter history between sessions.
    import readline
    histfile = os.path.expanduser('~/.pyhist')
    readline.read_history_file(histfile)
    atexit.register(readline.write_history_file, histfile)
except (ImportError, OSError):
    # readline may be unavailable, or the history file may not exist yet.
    pass


class Text: