コード例 #1
0
ファイル: csv2db.py プロジェクト: abiliooliveira/csv2psqldb
def toDB(file_path, server_name, db_name, table_name, user, password, csv_delimiter):
	"""Load the rows of a delimited text file into a database table.

	The first record of the file is handed to ``getInsertCommand`` to build
	the column list and the value template; every following record becomes
	one tuple of a single multi-row ``INSERT`` statement.

	Args:
		file_path: Path of the file to import.
		server_name: Database host, passed through to ``Database``.
		db_name: Name of the database to connect to.
		table_name: Target table. It is concatenated into the SQL text
			verbatim, so it must come from a trusted source.
		user: Database user name.
		password: Password for ``user``.
		csv_delimiter: Separator used inside each line; an empty string
			selects the default ``";"``.

	Raises:
		StopIteration: If the file has no header record.
	"""
	folder, file_name = os.path.split(file_path)
	# Resolve the default once so the header and the data rows are split
	# with the same separator (data rows used to be split on ',' only).
	delimiter = csv_delimiter if csv_delimiter != "" else ";"

	with Util.cd(folder):
		with open(file_name, 'rb') as csv_file:
			reader = UnicodeReader(csv_file)
			db = Database(db_name, user, server_name, 5432, password)
			try:
				fields, values = getInsertCommand(reader.next()[0].split(delimiter), table_name)

				print('running...')
				rows = []
				while True:
					try:
						record = reader.next()
					except StopIteration:
						# Only a real end of input ends the loop; any other
						# error propagates instead of silently truncating
						# the import.
						break
					if not record:
						# Blank line in the input: nothing to insert.
						continue
					cells = record[0].split(delimiter)
					# The literal text 'None' stands for an empty value.
					rows.append(tuple('' if cell == 'None' else cell for cell in cells))

				if rows:
					# NOTE(review): table_name and fields are interpolated
					# as-is (identifiers cannot be bound as parameters);
					# only the row values are escaped, through mogrify.
					args_str = ','.join(db.cur.mogrify(values, row) for row in rows)
					db.cur.execute("INSERT INTO " + table_name + " " + fields + " VALUES " + args_str)
					db.conn.commit()
			finally:
				# Release the connection even when parsing or the insert fails.
				db.conn.close()

	print('done!')
コード例 #2
0
    def loadGenderList(gender, country, dataPath, hasHeader):
        """Load the name list stored in ``<country><gender>UTF8.csv``.

        Args:
            gender: Gender part of the file name.
            country: Country part of the file name.
            dataPath: Directory containing the CSV file.
            hasHeader: When true, the first row is skipped.

        Returns:
            A dict keyed by lower-cased name. The value is the second
            column of the row when present; rows without a second column
            count once per distinct name. Every name is also registered
            under its ``unidecode`` transliteration unless that key
            already exists.
        """
        path = os.path.join(dataPath, '%s%sUTF8.csv' % (country, gender))
        names = {}
        # The context manager closes the file even if a row fails to parse.
        with open(path, 'rb') as fd:
            reader = UnicodeReader(fd)
            if hasHeader:
                reader.next()
            # Load names as-is, but lower cased.
            for row in reader:
                name = row[0].lower()
                try:
                    # The second column should be the count (number of
                    # babies in some year with this name).
                    # NOTE(review): the cell is stored as read, so if the
                    # reader yields text, `+=` below concatenates instead
                    # of adding -- confirm what callers expect.
                    count = row[1]
                except IndexError:
                    # No second column: default to 1, but only count once
                    # when the name was already seen modulo case (there is
                    # no frequency information anyway).
                    count = 0 if name in names else 1
                if name in names:
                    names[name] += count
                else:
                    names[name] = count

        # Add versions without diacritics. Iterate over a snapshot of the
        # keys because the dict grows inside the loop.
        for name in list(names):
            dname = unidecode(name)
            if dname not in names:
                names[dname] = names[name]

        return names
コード例 #3
0
	def loadGenderList(gender, country, dataPath, hasHeader):
		"""Load the name list stored in ``<country><gender>UTF8.csv``.

		Args:
			gender: Gender part of the file name.
			country: Country part of the file name.
			dataPath: Directory containing the CSV file.
			hasHeader: When true, the first row is skipped.

		Returns:
			A dict keyed by lower-cased name. The value is the second
			column of the row when present; rows without a second column
			count once per distinct name. Every name is also registered
			under its ``unidecode`` transliteration unless that key
			already exists.
		"""
		path = os.path.join(dataPath, '%s%sUTF8.csv' % (country, gender))
		names = {}
		# The context manager closes the file even if a row fails to parse.
		with open(path, 'rb') as fd:
			reader = UnicodeReader(fd)
			if hasHeader:
				reader.next()
			# Load names as-is, but lower cased.
			for row in reader:
				name = row[0].lower()
				try:
					# The second column should be the count (number of
					# babies in some year with this name).
					# NOTE(review): the cell is stored as read, so if the
					# reader yields text, `+=` below concatenates instead
					# of adding -- confirm what callers expect.
					count = row[1]
				except IndexError:
					# No second column: default to 1, but only count once
					# when the name was already seen modulo case (there is
					# no frequency information anyway).
					count = 0 if name in names else 1
				if name in names:
					names[name] += count
				else:
					names[name] = count

		# Add versions without diacritics. Iterate over a snapshot of the
		# keys because the dict grows inside the loop.
		for name in list(names):
			dname = unidecode(name)
			if dname not in names:
				names[dname] = names[name]

		return names
コード例 #4
0
def load_acceptance_ratio():
    """Read yearly submission counts and attach them to their venues.

    ``numSubmissions.csv`` is read from ``DATA_PATH``. Its header row names
    one conference per column (from the second column on), and each data
    row starts with a year followed by that year's count per conference.
    Cells that are not integers are skipped.

    For every entry of ``CONFERENCES`` the matching ``Venue`` is looked up
    by upper-cased acronym and one ``SubmissionsCount`` per recorded year
    is added to the session.

    Raises:
        KeyError: If a conference in ``CONFERENCES`` has no usable column
            in the CSV file.
    """
    session = SessionFactory.get_session()
    print('Loading acceptance ratios:')

    # subm[conference][year] -> number of submissions
    subm = {}
    # The context manager closes the file once all rows are consumed.
    with open(os.path.join(DATA_PATH, 'numSubmissions.csv'), "rb") as f:
        reader = UnicodeReader(f)
        header = reader.next()
        for row in reader:
            year = int(row[0])
            # Start at 1 so the index lines up with the header columns.
            for idx, val in enumerate(row[1:], 1):
                conf = header[idx]
                try:
                    count = int(val)
                except ValueError:
                    # Empty or non-numeric cell: no count for this year.
                    continue
                subm.setdefault(conf, {})[year] = count

    for acronym, name, impact in CONFERENCES:
        venue_acronym = acronym.upper()
        print(venue_acronym)
        conference = session.query(Venue).\
                filter_by(acronym=venue_acronym).\
                one()

        for (year, count) in subm[venue_acronym].items():
            numSubm = SubmissionsCount(year, count)
            numSubm.venue = conference
            session.add(numSubm)
    # NOTE(review): nothing is committed here -- presumably the caller
    # commits the session; confirm.
コード例 #5
0
                author = session.query(Person).\
                        filter_by(name=author_name).\
                        one()
            except:
                # New author; add to database
                author = Person(author_name)
                session.add(author)

            paper.authors.append(author)


# --- Update 8/12/2013 ---
# Record also the role of PC members (PC Chair or General Chair)
# The roles file is opened in binary mode and wrapped in a UnicodeReader.
# NOTE(review): the handle `f` is not closed in the visible code -- confirm
# it is released once the rows have been read.
f = open(os.path.join(dataPath, 'SE-conf-roles.csv'), 'rb')
reader = UnicodeReader(f)
# Consume the first row so that iterating `reader` starts at the data rows.
header = reader.next()
# Presumably filled by the row loop further down -- TODO confirm.
roles = {}

def confName(conf):
    """Return the short acronym for a joint-conference label.

    'ESEC/FSE' becomes 'FSE' and 'CSMR-WCRE' becomes 'CSMR'; any other
    value is returned unchanged.
    """
    joint_labels = (
        ('ESEC/FSE', 'FSE'),
        ('CSMR-WCRE', 'CSMR'),
    )
    for joint_label, short_label in joint_labels:
        if conf == joint_label:
            return short_label
    return conf
     
for row in reader:
    conf = confName(row[0].strip()).lower()
    year = int(row[1])
    name = '%s %s' % (row[2].strip(), row[3].strip())
    try:
コード例 #6
0
ファイル: lsaAlgorithm.py プロジェクト: tue-mdse/aliasMerger
	# Choose dataset
	#dataset = 'aliases2'
	dataset = 'gnome'
	dataset = 'icsm'
	
	# Choose dataset size
	datasetsize = 'full'
	#datasetsize = 'sample'
	
	nameEmailData = MyDict()
	if dataset == 'aliases2':
		f = open(os.path.join(dataPath, "aliases2.csv"), "rb")
	#	f = open(os.path.join(dataPath, "testData2.csv"), "rb")
		reader = UnicodeReader(f)
		header = reader.next()
			
		for row in reader:
			try:
				idx = int(row[0])
				name = row[1]
				email = unspam(row[2])
				nameEmailData[idx] = (name, email)
			except:
				print row
		f.close()
		print 'Using the aliases2.csv data set...'
	elif dataset == 'gnome':
		import email.utils
		import email.header
	
コード例 #7
0
def _load_pc_roles():
    """Read SE-conf-roles.csv into a {(name, conference, year): role} dict.

    Conference names are normalised through ``confName`` and lower-cased,
    names are built as "<first> <last>" and passed through ``nameMap``
    when it has an entry for them. The role 'Organiser' is recorded as
    'PC member main track'. Rows whose name contains '?' or whose role is
    'Challenge Chair' or 'Data Chair' are left out.
    """
    def confName(conf):
        # Joint events are filed under the acronym of one of their parts.
        if conf == 'ESEC/FSE':
            return 'FSE'
        if conf == 'CSMR-WCRE':
            return 'CSMR'
        return conf

    # File layout, as recorded by the original author:
    #Conference;Year;First Name;Last Name;Sex;Role
    #CSMR;2013;Anthony ;Cleve;Male;Program Chair
    #CSMR;2013;Filippo;Ricca;Male;Program Chair
    #CSMR;2013;Maura;Cerioli;Female;General Chair
    roles = {}
    # The context manager closes the file once all rows are consumed.
    with open(os.path.join(DATA_PATH, 'SE-conf-roles.csv'), 'rb') as f:
        reader = UnicodeReader(f)
        reader.next()  # skip the header row
        for row in reader:
            conf = confName(row[0].strip()).lower()
            year = int(row[1])
            name = '%s %s' % (row[2].strip(), row[3].strip())
            try:
                name = nameMap[name]
            except Exception:
                # Best effort: keep the name as written when it cannot be
                # mapped. NOTE(review): nameMap is defined outside this
                # block; narrow this to KeyError once that is confirmed.
                pass
            role = row[5]
            if role == 'Organiser':
                role = 'PC member main track'

            if '?' not in name and role not in ('Challenge Chair', 'Data Chair'):
                roles[(name, conf, year)] = role
    return roles


def load_pc():
    """Load programme-committee memberships for every known conference.

    For each ``(acronym, name, impact)`` entry of ``CONFERENCES`` the
    matching ``Venue`` is fetched or created, the rows of
    ``normalised-pc/<acronym>.csv`` (year in column 0, member name in
    column 2) are read, and a ``Person`` and ``PCMembership`` are added to
    the session when no matching record is found. A member's role comes
    from SE-conf-roles.csv when listed there, otherwise it is
    'PC member main track'. Role entries that matched no committee row
    are printed per conference. The session is committed once at the end.
    """
    session = SessionFactory.get_session()
    # --- Update 8/12/2013 ---
    # Record also the role of PC members (PC Chair or General Chair)
    roles = _load_pc_roles()
    # -----------------------

    print('Loading PC members:')
    for acronym, name, impact in CONFERENCES:
        # Venues are stored under the upper-cased acronym, while the role
        # keys and the CSV file names use the lower-cased one. Normalise
        # once so lookups agree with how the data was written.
        venue_acronym = acronym.upper()
        roles_key = acronym.lower()
        print(venue_acronym)

        # Get the conference object
        try:
            # I already have this PC conference in the database
            conference = session.query(Venue).\
                    filter_by(acronym=venue_acronym).\
                    one()
        except Exception:
            # NOTE(review): presumably SQLAlchemy's NoResultFound; its
            # import is not visible from this block, hence the broad catch.
            # New conference; add to database
            conference = Venue(venue_acronym, impact, name, is_conference=True)
            session.add(conference)

        # --- Update 8/12/2013 ---
        # Role entries of this conference not yet seen in its PC file.
        withRole = set((member, year)
                       for (member, conf, year) in roles
                       if conf == roles_key)
        # -----------------------

        # Load the data into a csv reader; the file is closed when done.
        pc_path = os.path.join(DATA_PATH, 'normalised-pc', '%s.csv' % roles_key)
        with open(pc_path, 'rb') as f:
            for row in UnicodeReader(f):
                # Deconstruct each row
                year = int(row[0])
                pcMemberName = row[2].strip()

                # --- Update 8/12/2013 ---
                role_key = (pcMemberName, roles_key, year)
                if role_key in roles:
                    role = roles[role_key]
                    withRole.discard((pcMemberName, year))
                else:
                    role = 'PC member main track'
                # -----------------------

                if not pcMemberName:
                    continue

                # Get the PC member object
                try:
                    # I already have this PC member in the database
                    pcMember = session.query(Person).\
                            filter_by(name=pcMemberName).\
                            one()
                except Exception:
                    # New person; add to database
                    pcMember = Person(pcMemberName)
                    session.add(pcMember)

                try:
                    session.query(PCMembership).\
                            filter_by(year=year).\
                            filter_by(role=role).\
                            filter_by(pcmember=pcMember).\
                            filter_by(venue=conference).\
                            one()
                except Exception:
                    # New, add to database
                    membership = PCMembership(year, role)
                    membership.pcmember = pcMember
                    membership.venue = conference
                    session.add(membership)

        # --- Update 8/12/2013 ---
        # Whatever is left had a recorded role but no row in the PC file.
        print(sorted(withRole))
        # -----------------------
    session.commit()