def reducefn(author, titles):
    """Reduce step: count cumulative term frequencies over one author's titles.

    Args:
        author: the reduce key (not used in the computation itself).
        titles: iterable of title strings attributed to this author.

    Returns:
        dict mapping term -> cumulative frequency, after lower-casing /
        punctuation stripping via ``normalize_words`` and excluding stop
        words and single-letter terms.
    """
    # Project-local imports kept function-scoped, as in the original.
    from stopwords import allStopWords
    from utils import normalize_words

    terms_freq = {}
    for title in titles:
        # normalize_words handles case-folding and punctuation/hyphen removal.
        for term in normalize_words(title).split():
            # Skip stop words and single-letter terms.
            # `in` replaces the Python-2-only dict.has_key(), which was
            # removed in Python 3.
            if term not in allStopWords and len(term) != 1:
                terms_freq[term] = terms_freq.get(term, 0) + 1
    return terms_freq
def reducefn(author, titles):
    """Reduce step: count cumulative term frequencies over one author's titles.

    Args:
        author: the reduce key (not used in the computation itself).
        titles: iterable of title strings attributed to this author.

    Returns:
        dict mapping term -> cumulative frequency, after lower-casing /
        punctuation stripping via ``normalize_words`` and excluding stop
        words and single-letter terms.
    """
    from collections import Counter

    # Project-local imports kept function-scoped, as in the original.
    from stopwords import allStopWords
    from utils import normalize_words

    # Counter replaces the hand-rolled has_key()/increment pattern;
    # dict.has_key() was removed in Python 3, so membership uses `in`.
    counts = Counter()
    for title in titles:
        counts.update(
            term
            for term in normalize_words(title).split()
            if term not in allStopWords and len(term) != 1
        )
    # Return a plain dict to match the original return type exactly.
    return dict(counts)
def mapfn(filenumber, filecontent):
    """Map step: emit (author, concatenated-titles) pairs for one input file.

    Each input line has the format ``conf:::author_1::author_2:::title``
    (fields separated by ':::', co-authors by '::').

    Args:
        filenumber: the map key (not used in the computation itself).
        filecontent: full text of one input file.

    Yields:
        (author, titles) tuples where ``titles`` is a single
        space-separated string of all titles attributed to that author.
    """
    # Project-local import kept function-scoped, as in the original.
    from utils import normalize_words

    author_contents = {}
    for line in filecontent.splitlines():
        docdata = line.split(':::')
        authors_list = docdata[1].split('::')
        title = docdata[-1]
        for author in authors_list:
            author = normalize_words(author)
            # `in` replaces the Python-2-only dict.has_key(), which was
            # removed in Python 3.
            if author in author_contents:
                author_contents[author] = author_contents[author] + " " + title
            else:
                author_contents[author] = title
    # Iterate items() directly instead of keys() + per-key lookup.
    for author, contents in author_contents.items():
        yield author, contents
def mapfn(filenumber, filecontent):
    """Map step: emit (author, concatenated-titles) pairs for one input file.

    Each input line has the format ``conf:::author_1::author_2:::title``
    (fields separated by ':::', co-authors by '::').

    Args:
        filenumber: the map key (not used in the computation itself).
        filecontent: full text of one input file.

    Yields:
        (author, titles) tuples where ``titles`` is a single
        space-separated string of all titles attributed to that author.
    """
    # Project-local import kept function-scoped, as in the original.
    from utils import normalize_words

    # Accumulate titles in lists and join once per author: same output as
    # repeated `+ " " +` concatenation, but linear instead of quadratic.
    # setdefault also replaces the Python-2-only dict.has_key() check,
    # which was removed in Python 3.
    titles_by_author = {}
    for line in filecontent.splitlines():
        fields = line.split(':::')
        title = fields[-1]
        for raw_author in fields[1].split('::'):
            author = normalize_words(raw_author)
            titles_by_author.setdefault(author, []).append(title)
    for author, titles in titles_by_author.items():
        yield author, " ".join(titles)