Python TFIDF.process Examples

Programming Language: Python

Namespace/Package Name: tfidf

Class/Type: TFIDF

Method/Function: process

Examples at hotexamples.com: 4

Python TFIDF.process - 4 examples found. These are the top-rated real-world Python examples of tfidf.TFIDF.process, extracted from open-source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

TFIDF(30)

transform(4)

highest(2)

get_tfidf(2)

process(2)

tf_idf(2)

done(2)

docHandler(2)

to_array(1)

tfidf(1)

similarity(1)

similar_docs(1)

save_to_pickle(1)

relevancy(1)

prepare_data(1)

train_from_text(1)

normalized_additive_idf_ignore_common_words(1)

get_tfidf_dataframe(1)

get_tfs(1)

calc_cosine_similarity(1)

get_from_pickle(1)

getTFIDF(1)

getOnlyXData(1)

getOnlyX(1)

getIDF(1)

gen_vector(1)

from_array(1)

fit_transform(1)

findNumDocs(1)

create(1)

calculate_tfidf_document(1)

update(1)

Example #1

Show file

File: get_tfidf_words.py Project: angadgadre/DataInColour

def wordcount(filename, ent_file, tfidf, text, id):
    """Feed grouped CSV text through a ``TFIDF`` counter and dump the scores.

    The first line of *filename* is skipped as a header.  The remaining rows
    are grouped with ``groupby`` on ``id(row)`` (presumably
    ``itertools.groupby``, i.e. only adjacent rows with equal keys are merged
    -- confirm against the file's imports).  Each group's text is joined with
    spaces, lower-cased per row and passed to ``TFIDF.process`` as one
    document.  Finally the entries returned by ``highest(200)`` are written
    to *tfidf* as ``word<TAB>score`` lines.

    Args:
        filename: Path of the input CSV file.
        ent_file: Handed to ``get_entities``; its result initialises ``TFIDF``.
        tfidf: Path of the output file (opened for writing, truncated).
        text: Callable mapping one CSV row to the string to analyse.
        id: Callable mapping one CSV row to its grouping key.
    """
    # Context managers guarantee both files are closed (and the output is
    # flushed) even if processing raises part-way through.
    with open(filename) as resources:
        resources.readline()  # header
        counter = TFIDF(get_entities(ent_file))
        # The group key itself is not needed, only the rows of each group.
        for _key, lines in groupby(csv.reader(resources), id):
            maintext = ' '.join(text(line).lower() for line in lines)
            counter.process(maintext)
    counter.done()

    with open(tfidf, 'w') as out:
        # highest() yields 4-tuples; only the first and last fields are
        # written (the two middle fields are ignored here).
        for word, _, _, score in counter.highest(200):
            out.write('%s\t%f\n' % (word, score))

Example #2

Show file

File: get_tfidf_words.py Project: aravindanbalan/DataInColour

def wordcount(filename, ent_file, tfidf, text, id):
  """Compute TF-IDF statistics for grouped CSV rows and write them to a file.

  Skips the header line of *filename*, groups the remaining CSV rows with
  ``groupby`` keyed by ``id(row)`` (presumably ``itertools.groupby``, which
  merges only consecutive rows sharing a key -- verify against the imports),
  and submits each group's space-joined, lower-cased text to a ``TFIDF``
  instance.  The result of ``highest(200)`` is then written to *tfidf*, one
  ``word<TAB>score`` pair per line.

  Args:
    filename: Path of the CSV file to read.
    ent_file: Argument for ``get_entities``, whose result is given to ``TFIDF``.
    tfidf: Path of the file to write the scores to.
    text: Callable extracting the text of a CSV row.
    id: Callable extracting the grouping key of a CSV row.
  """
  # ``with`` closes the input even when an exception interrupts the loop.
  with open(filename) as resources:
    resources.readline() # header
    stats = TFIDF(get_entities(ent_file))
    for _group_key, lines in groupby(csv.reader(resources), id):
      maintext = ' '.join(text(line).lower() for line in lines)
      stats.process(maintext)
  stats.done()

  # ``with`` also ensures the output is flushed and closed on return.
  with open(tfidf, 'w') as out:
    # Only the first and fourth fields of each 4-tuple are emitted.
    for word, _, _, score in stats.highest(200):
      out.write('%s\t%f\n' % (word, score))

Example #3

Show file

File: data_2011.py Project: angadgadre/DataInColour

def count(district,
          type='essays',
          extract_text=lambda line: ' '.join(line[3:10]),
          id=lambda line: line[0]):
    """Run TF-IDF over the texts of projects posted in 2011 and dump the result.

    Reads ``../data/projects.<district>csv`` to collect the ids (column 0) of
    all projects whose ``date_posted`` column starts with ``'2011'``, then
    reads ``../data/<type>.<district>csv``, groups its rows by ``id(row)`` and
    feeds the lower-cased text of every group belonging to one of those
    projects to a ``TFIDF`` instance.  The entries returned by ``highest(0)``
    are written to ``../data/wc_<type><district>csv`` as tab-separated
    ``word, tf, df, tfidf`` lines.

    NOTE(review): ``ent_file`` is not a parameter; it must be defined at
    module level for this function to run -- confirm.
    NOTE(review): *district* is interpolated directly before ``csv``, so it
    presumably carries its own trailing dot (or is empty) -- confirm.

    Args:
        district: Fragment inserted into the three data file names.
        type: Base name of the text CSV file and part of the output name.
        extract_text: Callable mapping a CSV row to its text.
        id: Callable mapping a CSV row to the project id used for grouping.
    """
    # Zero-based column indices of the projects CSV; only ``date_posted`` is
    # used below, the rest document the expected 46-column layout.
    (_projectid, _teacher_acctid, _schoolid, school_ncesid, school_latitude,
     school_longitude, school_city, school_state, school_zip, school_metro,
     school_district, school_county, school_charter, school_magnet,
     school_year_round, school_nlns, school_kipp, school_charter_ready_promise,
     teacher_prefix, teacher_teach_for_america, teacher_ny_teaching_fellow,
     primary_focus_subject, primary_focus_area, secondary_focus_subject,
     secondary_focus_area, resource_usage, resource_type, poverty_level,
     grade_level, vendor_shipping_charges, sales_tax,
     payment_processing_charges, fulfillment_labor_materials,
     total_price_excluding_optional_support,
     total_price_including_optional_support, students_reached,
     used_by_future_students, total_donations, num_donors,
     eligible_double_your_impact_match, eligible_almost_home_match,
     funding_status, date_posted, date_completed, date_thank_you_packet_mailed,
     date_expiration) = range(46)

    # ``with`` closes each file even if reading or processing raises.
    with open('../data/projects.%scsv' % district) as projects:
        projects.readline()  # header
        # A frozenset gives O(1) membership tests in the essay loop below.
        proj_ids = frozenset(
            proj[0] for proj in csv.reader(projects)
            if proj[date_posted].startswith('2011'))

    wordcount = TFIDF(get_entities(ent_file))
    with open('../data/%s.%scsv' % (type, district)) as essays:
        essays.readline()  # header
        for proid, lines in groupby(csv.reader(essays), id):
            if proid in proj_ids:
                text = ' '.join(extract_text(line) for line in lines).lower()
                wordcount.process(text)
    wordcount.done()

    # The original never closed this handle; ``with`` guarantees the output
    # is flushed to disk before the function returns.
    with open('../data/wc_%s%scsv' % (type, district), 'w') as out:
        for word, tf, df, tfidf in wordcount.highest(0):
            out.write('%s\t%f\t%f\t%f\n' % (word, tf, df, tfidf))

Example #4

Show file

File: data_2011.py Project: aravindanbalan/DataInColour

def count(district, type='essays', extract_text=lambda line: ' '.join(line[3:10]), id=lambda line:line[0]):
  """Compute TF-IDF statistics over the texts of projects posted in 2011.

  Steps:
    1. Read ``../data/projects.<district>csv`` and remember column 0 (the
       project id) of every row whose ``date_posted`` column starts with
       ``'2011'``.
    2. Read ``../data/<type>.<district>csv``, group rows by ``id(row)`` and
       pass the lower-cased, space-joined text of each remembered project to
       ``TFIDF.process``.
    3. Write the entries from ``highest(0)`` to
       ``../data/wc_<type><district>csv`` as tab-separated
       ``word, tf, df, tfidf`` lines.

  NOTE(review): ``ent_file`` is a free name here and has to exist at module
  level -- confirm.
  NOTE(review): *district* is placed immediately before ``csv`` in every file
  name, so it presumably ends with a dot or is empty -- confirm.

  Args:
    district: Fragment inserted into the input and output file names.
    type: Base name of the text CSV; also used in the output file name.
    extract_text: Callable returning the text of a CSV row.
    id: Callable returning the project id of a CSV row (grouping key).
  """
  # Zero-based column positions in the projects CSV. Only ``date_posted`` is
  # read below; the other names record the expected 46-column layout.
  (_projectid, _teacher_acctid, _schoolid, school_ncesid, school_latitude,
   school_longitude, school_city, school_state, school_zip, school_metro,
   school_district, school_county, school_charter, school_magnet,
   school_year_round, school_nlns, school_kipp, school_charter_ready_promise,
   teacher_prefix, teacher_teach_for_america, teacher_ny_teaching_fellow,
   primary_focus_subject, primary_focus_area, secondary_focus_subject,
   secondary_focus_area, resource_usage, resource_type, poverty_level,
   grade_level, vendor_shipping_charges, sales_tax,
   payment_processing_charges, fulfillment_labor_materials,
   total_price_excluding_optional_support,
   total_price_including_optional_support, students_reached,
   used_by_future_students, total_donations, num_donors,
   eligible_double_your_impact_match, eligible_almost_home_match,
   funding_status, date_posted, date_completed, date_thank_you_packet_mailed,
   date_expiration) = range(46)

  # Context managers close the files even when an exception is raised.
  with open('../data/projects.%scsv' % district) as projects:
    projects.readline() # header
    proj_ids = frozenset(
      proj[0] for proj in csv.reader(projects)
      if proj[date_posted].startswith('2011'))

  wordcount = TFIDF(get_entities(ent_file))
  with open('../data/%s.%scsv' % (type, district)) as essays:
    essays.readline() # header
    for proid, lines in groupby(csv.reader(essays), id):
      if proid in proj_ids:
        text = ' '.join(extract_text(line) for line in lines).lower()
        wordcount.process(text)
  wordcount.done()

  # Previously this handle was left open; closing it guarantees the data is
  # flushed to disk.
  with open('../data/wc_%s%scsv' % (type, district), 'w') as out:
    for word, tf, df, tfidf in wordcount.highest(0):
      out.write('%s\t%f\t%f\t%f\n' % (word, tf, df, tfidf))