def computeSkillIndexes(profile, skills):
    """Compute the per-category skill index for a profile.

    Every skill in ``profile['skills']`` carries the same share,
    ``100.0 / len(profile['skills'])``. The share is added to each category
    of ``skills`` whose topic list contains the skill. A skill found in no
    category is handed to ``categorizer.categorize``; it is appended to the
    topic list of every distinct category returned, and the share is credited
    to each of those categories.

    Args:
        profile: mapping with a ``'skills'`` entry listing the skill names.
        skills: mapping of category -> list of topics. It is mutated in
            place when a previously unknown skill gets categorized.

    Returns:
        dict mapping category -> accumulated index. Empty when the profile
        lists no skills.
    """
    profile_skills = profile['skills']
    skillindex = {}
    if not profile_skills:
        # Nothing to score; also keeps the share below from dividing by zero.
        return skillindex

    # The share is identical for every skill, so compute it once.
    share = 100.0 / len(profile_skills)

    for item in profile_skills:
        known = False
        for category, topics in skills.items():
            if item in topics:
                # A skill may sit in several categories; credit each of them.
                skillindex[category] = skillindex.get(category, 0) + share
                known = True
        if known:
            continue

        # Unknown skill: ask the categorizer, remember the answer in
        # ``skills`` and credit each distinct category exactly once.
        for cat in set(categorizer.categorize(item)):
            skills[cat].append(item)
            skillindex[cat] = skillindex.get(cat, 0) + share

    return skillindex
def doc_categorize(fsort):
    """Accumulate weighted category scores over a document's terms.

    Each entry of ``fsort`` is indexed as ``entry[0]`` (the term, which is
    printed and passed to ``categorizer.categorize``) and ``entry[1]`` (a
    weight that multiplies every score returned for that term). The weighted
    scores are added element-wise into a four-slot accumulator.

    Args:
        fsort: sequence of ``(term, weight)``-style entries. It is iterated
            twice, so it must be re-iterable.

    Returns:
        list with the cumulative score per category slot; ``[0, 0, 0, 0]``
        when ``fsort`` is empty.
    """
    cat_prob = [0, 0, 0, 0]

    for term in fsort:
        # Single-argument print() behaves the same on Python 2 and 3.
        print(term[0])

    for kw in fsort:
        term_corr = [kw[1] * score for score in categorizer.categorize(kw[0])]
        # list(...) keeps the accumulator a real list on Python 3, where
        # map() is lazy; on Python 2 it is a no-op copy.
        cat_prob = list(map(operator.add, cat_prob, term_corr))

    # %-formatting reproduces the old ``print a, b`` output on both versions.
    print('cumulative probability : %s' % (cat_prob,))
    return cat_prob
def run_server(kw_list, log):
    """Refresh source probabilities and collect URIs for each keyword.

    Steps, in order:
      1. Update ``source_probs['general']`` through
         ``phase_one.phase1_update`` using the general sources and the first
         element of every keyword entry.
      2. Pickle the whole ``source_probs`` mapping to
         ``source_probabilities.sp`` in the current working directory.
      3. For every keyword, call ``get_uri`` against the general sources and,
         when the keyword's lower-cased category is a key of the local source
         table, against that category's sources as well.
      4. Call ``print_uri()``.

    Args:
        kw_list: sequence of keyword entries; ``entry[0]`` is the keyword
            text handed to the categorizer.
        log: passed through unchanged to ``phase_one.phase1_update``.
    """
    global source_probs

    cat_list = {
        'general': [
            ("Wikipedia", "site:wikipedia.org"),
            ("Citizendium", "site:citizendium.org"),
            ("Britannica", "site:britannica.com"),
        ],
        'technology': [
            ("gizmodo", "gizmodo"),
            ("theverge", "theverge"),
            ("engadget", "engadget"),
        ],
    }

    source_probs['general'] = phase_one.phase1_update(
        source_probs['general'], cat_list['general'],
        [item[0] for item in kw_list], 1, True, log)

    # Context manager guarantees the file is flushed and closed even if
    # pickling fails part-way.
    with open("source_probabilities.sp", "wb") as sp_file:
        pickle.dump(source_probs, sp_file)

    for kw in kw_list:
        kw_category = cat_mod.categorize(kw[0]).lower()
        # Same output as the former ``print kw, kw_category`` statement.
        print('%s %s' % (kw, kw_category))
        get_uri(kw, cat_list['general'], source_probs['general'], 2)
        if kw_category in cat_list:
            # NOTE(review): the probabilities always come from the
            # 'technology' entry, even when kw_category is 'general' --
            # confirm whether source_probs[kw_category] was intended.
            get_uri(kw, cat_list[kw_category], source_probs['technology'], 2)

    print_uri()
def process_davivienda_message(message):
    """Extract amount, category and description from a Davivienda message.

    The text of ``message.string`` is split into lines; the entries at
    positions 6, 7 and 8 supply the amount, the movement type and the
    description (the part after the first ``':'``) respectively.

    Returns:
        ``(amount, {'type': ..., 'item': ...}, desc)``, or
        ``(None, None, None)`` when the description contains ``'PSE'``.
    """
    print('Davivienda: processing message')

    def line_at(position):
        # Re-reads and re-splits the message text on every access.
        return nth(lines(message.string), position)

    amount = re.sub(r'[^\d.]', '', line_at(6))
    category_type = line_at(7)
    desc = trim(nth(split(line_at(8), ':'), 1))

    if has_substr(upper_case(desc), 'PSE'):
        print('Davivienda: Ignored PSE payment')
        return None, None, None

    # Lazy generator: the second keyword is only checked if the first misses.
    is_income = any(
        has_substr(lower_case(category_type), keyword)
        for keyword in ('deposito', 'abono'))

    if is_income:
        category_type = moneylover.CATEGORY_TYPE['income']
        if has_substr(upper_case(desc), 'ACH GNB SUDAMERIS'):
            category_item = 'Salary'
        else:
            category_item = 'Others'
    else:
        category_type = moneylover.CATEGORY_TYPE['expense']
        category_item = categorize(desc)

    category = {'type': category_type, 'item': category_item}
    return amount, category, desc
def process_pse_message(message):
    """Extract amount, expense category and description from a PSE message.

    The last ``<span>`` found under ``message.table.table`` is split into
    lines: the first line (with markup tags stripped) is the description and
    the second yields the amount, with ``','`` converted to ``'.'``.

    Returns:
        ``(amount, expense_category, desc, visa_category)`` where
        ``visa_category`` is an income/'Payment' dict when the description
        mentions ``'credito visa'`` and ``None`` otherwise.
    """
    print('PSE(davivienda): processing message')

    spans = message.table.table.find_all('span')
    data = lines(last(spans))

    desc = re.sub(r'<[^<>]*>', '', nth(data))
    raw_amount = re.sub(r'[^\d,]', '', nth(data, 1))
    amount = raw_amount.replace(',', '.')

    # The income entry is only built for Visa credit-card payments.
    visa_category = None
    if has_substr(lower_case(desc), 'credito visa'):
        visa_category = {
            'type': moneylover.CATEGORY_TYPE['income'],
            'item': 'Payment',
        }

    expense_category = {
        'type': moneylover.CATEGORY_TYPE['expense'],
        'item': categorize(desc),
    }
    return amount, expense_category, desc, visa_category
def computeSkillIndexes(profile, skills):
    """Compute the per-category skill index for a profile.

    Each skill in ``profile['skills']`` is worth
    ``100.0 / len(profile['skills'])``. That amount is added to every
    category in ``skills`` whose topic list already contains the skill.
    When no category contains it, ``categorizer.categorize`` is consulted:
    the skill is appended to the topic list of each distinct category it
    returns, and the amount is credited to each of those categories.

    Args:
        profile: mapping with a ``'skills'`` entry listing the skill names.
        skills: mapping of category -> list of topics; extended in place
            whenever an unknown skill is categorized.

    Returns:
        dict mapping category -> accumulated index, or an empty dict when
        the profile has no skills.
    """
    profile_skills = profile['skills']
    skillindex = {}
    if not profile_skills:
        # No skills means no shares; also avoids dividing by zero below.
        return skillindex

    # Loop-invariant: every skill contributes the same share.
    share = 100.0 / len(profile_skills)

    for item in profile_skills:
        matched = False
        for category, topics in skills.items():
            if item in topics:
                # Credit every category that lists the skill.
                skillindex[category] = skillindex.get(category, 0) + share
                matched = True

        if not matched:
            # Fall back to the categorizer and cache its answer in
            # ``skills`` so later lookups find the skill directly.
            for cat in set(categorizer.categorize(item)):
                skills[cat].append(item)
                skillindex[cat] = skillindex.get(cat, 0) + share

    return skillindex
from categorizer import categorize
from datetime import datetime


def Average(lst):
    """Return the arithmetic mean of the numbers in ``lst``.

    Raises:
        ZeroDivisionError: if ``lst`` is empty.
    """
    return sum(lst) / len(lst)


if __name__ == '__main__':
    # Ad-hoc benchmark: time one categorize() call over a fixed tag sample.
    tags = [
        'XMR/BTC',
        'BTC Options OI held',
        'DOGEBTC (TradingView)',
        'BTCB',
        'get-more-btc',
        'filbtc',
        'BTCADA (TradingView)',
        'bitcoin cash prices',
        'bitcoin btc/btcp makes bitcoin cash increase',
        'bitcoin cash increase',
    ]

    start_time = datetime.now()
    result = categorize(tags)
    # total_seconds() covers the whole elapsed interval. The timedelta
    # ``microseconds`` attribute is only the sub-second component, so it
    # under-reports any run that takes one second or longer.
    result_time = (datetime.now() - start_time).total_seconds() * 1000

    print(f"Time to categorize {len(tags)} Tags : ", result_time, "milliseconds")
    print("Time per Tag : ", result_time / len(tags), "milliseconds")
    print("tags :\n", tags)
    print("categories :\n", result)
#all in one place from categorizer import categorize categorize( "/content/drive/My Drive/Amazon Review Data/Office_Products_5.json.gz", prefix='office') categorize('/content/drive/My Drive/Amazon Review Data/Books_5.json.gz', prefix='books') categorize( '/content/drive/My Drive/Amazon Review Data/Cell_Phones_and_Accessories_5.json.gz', prefix='cell_phone') categorize( '/content/drive/My Drive/Amazon Review Data/Clothing_Shoes_and_Jewelry_5.json.gz', prefix='clothing_shoe_jewellery') categorize('/content/drive/My Drive/Amazon Review Data/Electronics_5.json.gz', prefix='Electronics') categorize('/content/drive/My Drive/Amazon Review Data/Kindle_Store_5.json.gz', prefix='Kindle') categorize( '/content/drive/My Drive/Amazon Review Data/Movies_and_TV_5.json.gz', prefix='Movies') categorize( '/content/drive/My Drive/Amazon Review Data/Sports_and_Outdoors_5.json.gz', prefix='Sports') delivery_data_paths = [ 'drive/My Drive/CSV/office_delivery.csv', 'drive/My Drive/CSV/books_delivery.csv', 'drive/My Drive/CSV/cell_phone_delivery.csv', 'drive/My Drive/CSV/clothing_shoe_jewellery_delivery.csv', 'drive/My Drive/CSV/Electronics_delivery.csv',
def process_scotiabank_message(message):
    """Extract amount, category and description from a Scotiabank message.

    The ``<p>`` elements under ``message.table`` are looked up twice: the
    entry at position 3 carries the description and the entry at position 4
    carries the amount, from which everything except digits and ``'.'`` is
    removed.

    Returns:
        ``(amount, {'type': 'EXPENSE', 'item': <category>}, desc)``.
    """
    print('Scotiabank: processing message')

    desc_node = nth(message.table.find_all('p'), 3)
    desc = desc_node.string

    amount_node = nth(message.table.find_all('p'), 4)
    amount = re.sub(r'[^\d.]', '', amount_node.string)

    category = {'type': 'EXPENSE', 'item': categorize(desc)}
    return amount, category, desc