def wordcount(filename, ent_file, tfidf, text, id):
    """Feed grouped CSV rows through a TFIDF counter and write the scores.

    Args:
        filename: path of the input CSV; its first line is skipped as a
            header.
        ent_file: handed to ``get_entities``, whose result constructs the
            ``TFIDF`` counter.
        tfidf: path of the output file (opened with mode ``'w'``).
        text: callable taking one CSV row and returning the string to count.
        id: callable taking one CSV row and returning its grouping key.
            ``groupby`` only merges *consecutive* rows, so rows that share
            a key must be adjacent in the file.

    Each item yielded by ``highest(200)`` is written as ``word<TAB>score``.
    """
    # Context managers guarantee both files are closed (and the output is
    # flushed) even if processing raises part-way through.
    with open(filename) as resources:
        resources.readline()  # header
        counter = TFIDF(get_entities(ent_file))
        for _key, rows in groupby(csv.reader(resources), id):
            # All rows of one group are lower-cased and joined into a
            # single document for the counter.
            counter.process(' '.join(text(row).lower() for row in rows))
    counter.done()

    with open(tfidf, 'w') as out:
        # Only the first and last fields of each 4-tuple are written.
        for word, _, _, score in counter.highest(200):
            out.write('%s\t%f\n' % (word, score))
def wordcount(filename, ent_file, tfidf, text, id):
  """Run grouped CSV rows through a TFIDF counter and dump the scores.

  Args:
    filename: input CSV path; the first line is treated as a header and
      skipped.
    ent_file: passed to ``get_entities`` to build the ``TFIDF`` counter.
    tfidf: output file path (opened for writing).
    text: callable mapping a CSV row to the string that gets counted.
    id: callable mapping a CSV row to its grouping key.  Because
      ``groupby`` groups only adjacent rows, rows with the same key are
      expected to be contiguous.

  One ``word<TAB>score`` line is written per item from ``highest(200)``.
  """
  # ``with`` closes the input even on error; the original never closed it.
  with open(filename) as resources:
    resources.readline() # header
    counter = TFIDF(get_entities(ent_file))
    for _key, rows in groupby(csv.reader(resources), id):
      # One lower-cased document per group of rows.
      document = ' '.join(text(row).lower() for row in rows)
      counter.process(document)
  counter.done()

  # ``with`` ensures the scores are flushed to disk before returning.
  with open(tfidf, 'w') as out:
    for word, _, _, score in counter.highest(200):
      out.write('%s\t%f\n' % (word, score))
Example #3
0
def count(district,
          type='essays',
          extract_text=lambda line: ' '.join(line[3:10]),
          id=lambda line: line[0]):
    """Write TFIDF statistics for the texts of projects posted in 2011.

    Reads ``../data/projects.<district>csv`` to collect the ids (first
    column) of projects whose ``date_posted`` field starts with ``'2011'``,
    then feeds the matching groups of ``../data/<type>.<district>csv`` to a
    ``TFIDF`` counter and writes ``word<TAB>tf<TAB>df<TAB>tfidf`` lines to
    ``../data/wc_<type><district>csv``.

    Args:
        district: string spliced into the three file names.
        type: base name of the text CSV to read (default ``'essays'``).
        extract_text: callable mapping one CSV row to the text to count.
        id: callable mapping one CSV row to its project id.  ``groupby``
            only merges consecutive rows, so rows of one project must be
            adjacent in the file.

    NOTE(review): ``ent_file`` is not a parameter or local; it is presumably
    a module-level name -- confirm it is defined before this is called.
    """
    # 0-based index of ``date_posted`` in the 46-column projects CSV; it is
    # the only column of that file this function looks at besides the id.
    date_posted = 42

    with open('../data/projects.%scsv' % district) as projects:
        projects.readline()  # header
        proj_ids = frozenset(
            proj[0] for proj in csv.reader(projects)
            if proj[date_posted].startswith('2011'))

    wordcount = TFIDF(get_entities(ent_file))
    with open('../data/%s.%scsv' % (type, district)) as essays:
        essays.readline()  # header
        for proid, lines in groupby(csv.reader(essays), id):
            if proid in proj_ids:
                # All rows of a project form one lower-cased document.
                text = ' '.join(extract_text(line) for line in lines).lower()
                wordcount.process(text)
    wordcount.done()

    # NOTE(review): unlike the input paths there is no '.' between <type>
    # and <district> here; kept as-is because readers of the file may rely
    # on the current name.
    with open('../data/wc_%s%scsv' % (type, district), 'w') as out:
        for word, tf, df, tfidf in wordcount.highest(0):
            out.write('%s\t%f\t%f\t%f\n' % (word, tf, df, tfidf))
Example #4
0
def count(district, type='essays', extract_text=lambda line: ' '.join(line[3:10]), id=lambda line:line[0]):
  """Compute TFIDF statistics over texts of projects posted in 2011.

  Project ids (first column) are taken from ``../data/projects.<district>csv``
  for rows whose ``date_posted`` field starts with ``'2011'``.  Matching
  groups from ``../data/<type>.<district>csv`` are then counted, and
  ``word<TAB>tf<TAB>df<TAB>tfidf`` lines are written to
  ``../data/wc_<type><district>csv``.

  Args:
    district: string inserted into the three file names.
    type: base name of the text CSV (default ``'essays'``).
    extract_text: callable turning one CSV row into the text to count.
    id: callable turning one CSV row into its project id.  Rows of the same
      project must be contiguous, since ``groupby`` groups adjacent rows only.

  NOTE(review): ``ent_file`` is a free variable here, presumably defined at
  module level -- verify.
  """
  # Position (0-based) of ``date_posted`` among the 46 projects CSV columns;
  # no other column index from that file is needed.
  date_posted = 42

  # ``with`` closes each file even if parsing or counting raises.
  with open('../data/projects.%scsv' % district) as projects:
    projects.readline() # header
    proj_ids = frozenset(
      row[0] for row in csv.reader(projects)
      if row[date_posted].startswith('2011'))

  wordcount = TFIDF(get_entities(ent_file))
  with open('../data/%s.%scsv' % (type, district)) as essays:
    essays.readline() # header
    for proid, lines in groupby(csv.reader(essays), id):
      if proid in proj_ids:
        # Concatenate every row of the project into one lower-cased text.
        text = ' '.join(extract_text(line) for line in lines).lower()
        wordcount.process(text)
  wordcount.done()

  # NOTE(review): the output name has no '.' between <type> and <district>,
  # unlike the inputs; left unchanged in case other code depends on it.
  with open('../data/wc_%s%scsv' % (type, district), 'w') as out:
    for word, tf, df, tfidf in wordcount.highest(0):
      out.write('%s\t%f\t%f\t%f\n' % (word, tf, df, tfidf))