# NOTE(review): the statements below are collapsed onto one physical line, so
# the original indentation (and with it the exact nesting) cannot be read off
# here. In order, the line holds:
#   1. The tail of a function whose `def` and loop header are outside this
#      view (presumably webToJson -- confirm). It stores convertTime(n[1]) in
#      addEvent["starting_time"], stores convertTime(n[0]) there instead when
#      n[0].isdigit() is true, and stores n[2] as addEvent["description"];
#      any condition guarding the first three statements is not visible.
#      Once the description is no longer the placeholder "description" and
#      the starting time is no longer the placeholder 1, the event is
#      coloured 'orange', printed, appended to timeline["times"], and
#      addEvent is reset to the placeholder dict. timeline is then appended
#      to outerMost, which is returned.
#   2. The script entry point: readCsv() -> reformat() -> webToJson() ->
#      convertToFile('usTimeline.json', ...).
# The `print` statements use Python 2 syntax.
print n[1] millis = convertTime(n[1]) addEvent["starting_time"] = millis if n[0].isdigit(): millis = convertTime(n[0]) addEvent["starting_time"] = millis addEvent["description"] = n[2] if addEvent["description"]!="description" and addEvent["starting_time"]!=1: addEvent["color"]='orange' print 'addingEvent' print addEvent timeline["times"].append(addEvent) addEvent={"color":"blue", "description":"description", "starting_time": 1} outerMost.append(timeline) return outerMost if __name__ == '__main__': allInformation = readCsv() newFormation = reformat(allInformation) finalFormation = webToJson(newFormation) convertToFile('usTimeline.json',finalFormation)
# Clean the parsed page, collect the text of every timeline <span>, and export
# the collected strings through webToJson() / convertToFile().
# NOTE(review): `soup` and `invalid_tags` are defined outside this view --
# presumably a BeautifulSoup document and a list of tag names; confirm.

# Remove every element that carries no text at all.
for x in soup.find_all():
    if not x.text:
        x.extract()

# Drop the listed tags themselves but keep whatever they wrap.
for tag in invalid_tags:
    for match in soup.find_all(tag):
        match.replaceWithChildren()

timelineBox = soup.find_all(class_="MsoNormal")
timeline = []
for stuff in timelineBox:
    timelineJunk = stuff.find_all("span")
    for junk in timelineJunk:
        # Keep the explicit length test: contents[0] is only read for spans
        # that have at least one child (bs4 defines len(tag) as the number of
        # direct children -- see the bs4 docs).
        if len(junk) > 0:
            toClean = junk.contents[0]
            clean = unicode(toClean)
            # Parenthesised single-argument print behaves the same as the
            # Python 2 print statement and also parses under Python 3.
            print(clean)
            # Characters outside ASCII are dropped instead of raising.
            clean = clean.encode('ascii', 'ignore')
            timeline.append(clean)

print(timeline)
results = webToJson(timeline)
convertToFile('timeline1.json', results)
# NOTE(review): the statements below are collapsed onto one physical line, so
# the original indentation (and with it the exact nesting) cannot be read off
# here. In order, the line holds:
#   1. The tail of a function whose `def` is outside this view (presumably
#      webToJson -- confirm); `timeline` and `addEvent` are set up before
#      this point. For each n in soup: when n.isdigit() is true,
#      convertTime(n) is stored in addEvent["starting_time"]; otherwise n is
#      stored as addEvent["description"]. Once the description is no longer
#      the placeholder "description" and the starting time is no longer the
#      placeholder 1, the event is coloured 'green', appended to
#      timeline["times"], and addEvent is reset to the placeholder dict.
#      timeline is then appended to outerMost, which is returned.
#   2. The script entry point: parsePage(url) -> webToJson() ->
#      convertToFile('timeline2.json', ...).
outerMost = [] for n in soup: if n.isdigit(): millis = convertTime(n) addEvent["starting_time"] = millis else: addEvent["description"] = n if addEvent["description"]!="description" and addEvent["starting_time"]!=1: addEvent["color"]='green' timeline["times"].append(addEvent) addEvent={"color":"blue", "description":"description", "starting_time": 1} outerMost.append(timeline) return outerMost if __name__ == '__main__': url ="http://tgmaa.weebly.com/chronology.html" parsed = parsePage(url) converted = webToJson(parsed) convertToFile('timeline2.json',converted)
"http://library.howard.edu/content.php?pid=257155&sid=2164686") data = r.text soup = BeautifulSoup(data) invalid_tags = ['b', 'i', 'u'] for x in soup.find_all(): if len(x.text) == 0: x.extract() for tag in invalid_tags: for match in soup.findAll(tag): match.replaceWithChildren() timelineBox = soup.find_all(class_="MsoNormal") timeline = [] for stuff in timelineBox: timelineJunk = stuff.find_all("span") for junk in timelineJunk: if len(junk) > 0: toClean = junk.contents[0] clean = unicode(toClean) print clean clean = clean.encode('ascii', 'ignore') timeline.append(clean) print timeline results = webToJson(timeline) convertToFile('timeline1.json', results)