def preprocessing(corpus):
    for document in corpus:
        doc = strip_numeric(document)
        doc = remove_stopwords(doc)
        doc = strip_short(doc, 3)
        # doc = stem_text(doc)
        doc = strip_punctuation(doc)
        doc = strip_tags(doc)
        yield gensim.utils.tokenize(doc, lower=True)
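# A minimal usage sketch for the generator above, assuming these gensim imports;
# the two-document corpus is purely illustrative.
import gensim
from gensim.parsing.preprocessing import (remove_stopwords, strip_numeric,
                                          strip_punctuation, strip_short, strip_tags)

corpus = ["<p>First document, written in 2024, about preprocessing.</p>",
          "Another <b>short</b> document."]
tokenized_docs = [list(tokens) for tokens in preprocessing(corpus)]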
def compute_tokens(steam_sentences=None, save_to_disk=False, use_spacy=False):
    print('Computing tokens')

    if steam_sentences is None:
        steam_sentences = load_raw_data()

    counter = 0
    num_games = len(steam_sentences)

    steam_tokens = {}

    # You need to have downloaded the model first.
    # Reference: https://spacy.io/models/en#section-en_core_web_lg
    nlp = spacy.load('en_core_web_lg')

    for app_id in steam_sentences:
        game_data = steam_sentences[app_id]

        counter += 1
        if (counter % 1000) == 0:
            print('[{}/{}] appID = {} ({})'.format(counter, num_games, app_id, game_data['name']))

        if use_spacy:
            original_str = str(strip_tags(game_data['text']))
            original_str = original_str.replace('\t', ' ')
            # Reference: https://nicschrading.com/project/Intro-to-NLP-with-spaCy/
            original_str = original_str.strip().replace('\n', ' ').replace('\r', ' ')
            original_str = original_str.replace('&amp;', 'and').replace('&gt;', '>').replace('&lt;', '<')

            doc = nlp(original_str)

            ents = [str(entity).strip() for entity in doc.ents]  # Named entities.

            # Keep only words (no numbers, no punctuation).
            # Lemmatize tokens, remove punctuation and remove stopwords.
            doc = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]

            # Add named entities, but only if they are a compound of more than one word.
            relevant_entities = [str(entity) for entity in ents if len(entity) > 1]
            doc.extend(relevant_entities)

            game_tokens = doc
        else:
            game_tokens = simple_preprocess(remove_stopwords(strip_tags(game_data['text'])), deacc=True, min_len=3)

        steam_tokens[app_id] = list(game_tokens)

    if save_to_disk:
        with open(get_token_file_name(), 'w') as f:
            json.dump(steam_tokens, f)

    return steam_tokens
def getLemmatizedText(name, content, language):
    language = language[:2]
    language = language.lower()
    outText = ""
    if language:
        if language == "is":
            outText = getLemmatizedTextIS(name, content)
            print("IS")
        else:
            outText = lemmatizerMultilanguage.getLemmatizedText(language, name + " " + content)
            print(language.upper())
    else:
        text = name + " " + content
        outText = text.lower()
        print("ERROR: No language for Lemmatizing text")
    cleaned = re.sub(' +', ' ', outText)
    cleaned = cleaned.replace('\n', '')
    cleaned = cleaned.replace('\r', '')
    cleaned = remove_stopwords(cleaned)
    cleaned = strip_tags(cleaned)
    cleaned = strip_punctuation(cleaned)
    cleaned = strip_numeric(cleaned)
    cleaned = strip_short(cleaned, 1)
    cleaned = strip_multiple_whitespaces(cleaned)
    cleaned = cleaned.lower()
    print("Lemmatized CLEAN: " + cleaned)
    return cleaned
def pre_process(s):
    s = str(s)
    s = strip_tags(s)
    s = deaccent(s)
    s = strip_multiple_whitespaces(s)
    s = s.lower()
    return s
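# A small usage sketch for pre_process, assuming these gensim helpers are in scope:
from gensim.utils import deaccent
from gensim.parsing.preprocessing import strip_multiple_whitespaces, strip_tags

print(pre_process("<b>  Crème   Brûlée </b>"))  # roughly " creme brulee " after de-accenting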
def texts_to_sents(texts, model="en_core_web_sm", remove_stop=True, lemmatize=True):
    """
    Transform a list of texts into a list of sents (lists of tokens)
    and apply simple text preprocessing.
    """
    texts = [strip_tags(t) for t in texts]
    results = []

    assert spacy is not None, 'please install spacy, i.e., "pip install spacy"'
    try:
        nlp = spacy.load(model, disable=["ner"])
    except Exception as e:
        print(e, "\ntrying to download model...")
        os.system("python -m spacy download " + model)
        nlp = spacy.load(model, disable=["ner"])

    for doc in tqdm(nlp.pipe(texts), total=len(texts), desc="texts to sents"):
        for s in doc.sents:
            results.append([
                simple_preproc(strip_non_alphanum(t.lemma_ if lemmatize else t.text))
                for t in s
                if not any((t.is_punct, t.is_space, remove_stop and t.is_stop))
            ])
    return results
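# A minimal usage sketch, assuming spacy, tqdm, os and the gensim helpers
# (strip_tags, strip_non_alphanum) are imported, and that simple_preproc is the
# project's own per-token clean-up helper (not shown here).
docs = ["<p>The cats sat on the mat. They purred loudly.</p>"]
sents = texts_to_sents(docs, model="en_core_web_sm")
# `sents` holds one list of cleaned (optionally lemmatized) tokens per detected sentence.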
def sentence_tokenize_and_word_tokenize_and_remove_stop_words(
        text, tokenizer, stop_word1, stop_word2):
    try:
        if isinstance(text, str):
            sentences = tokenizer.tokenize(text.lower())
        else:
            sentences = tokenizer.tokenize(str(text).lower())
    except UnicodeDecodeError as e:
        return ''
    if len(sentences) == 0:
        return ''
    text_total = ''
    for sentence in sentences:
        words = sentence.split()
        if len(words) == 0:
            continue
        text = ' '.join(filter(lambda x: x not in stop_word1, words))
        try:
            text = preprocessing.strip_punctuation(text)
            text = preprocessing.strip_non_alphanum(text)
            text = preprocessing.strip_numeric(text)
            text = preprocessing.strip_tags(text)
            text = preprocessing.strip_multiple_whitespaces(text)
            words = text.split()
            if len(words) == 0:
                continue
            text = ' '.join(filter(lambda x: x not in stop_word2, words))
            text_total = text_total + text.encode('utf-8') + '#'
        except UnicodeDecodeError as e:
            pass
    return text_total
def clean_text(text):
    """ Cleans the text in the only argument in various steps
    ARGUMENTS: text: content/title, string
    RETURNS: cleaned text, string"""
    if isfloat(text):
        try:
            if math.isnan(text):
                return ''
        except TypeError:
            print('text: {}'.format(text))
            return ''
    # Replace newlines by space. We want only one doc vector.
    text = text.replace('\n', ' ').lower()
    # Expand contractions: you're to you are and so on.
    # text = contractions.fix(text)
    # Remove stop words
    text = preprocessing.remove_stopwords(text)
    # Remove html tags and numbers: can numbers possibly be useful?
    text = preprocessing.strip_tags(preprocessing.strip_numeric(text))
    # Remove punctuation -- all special characters
    text = preprocessing.strip_multiple_whitespaces(
        preprocessing.strip_punctuation(text))
    # text = re.sub(r'[^\w\s]', '', text.lower())
    # STEMMING (Porter) automatically lower-cases as well
    # To stem or not to stem, that is the question
    # text = preprocessing.stem_text(text)
    return text
def _normalize_target(s):
    s = s.lower()
    for k, v in contractions.items():
        s = s.replace(k, v)
    return strip_multiple_whitespaces(strip_punctuation(strip_tags(s))).split()
def clean(sx):
    sx = strip_tags(sx)
    sx = strip_numeric(sx)
    sx = re.sub(r'\n', ' ', sx)
    sx = re.sub(r'\[', '', sx)
    sx = re.sub(r'\]', '', sx)
    sx = strip_multiple_whitespaces(sx)
    return sx
def _normalize(s):
    s = s.lower()
    for k, v in contractions.items():
        s = s.replace(k, v)
    return strip_multiple_whitespaces(
        strip_non_alphanum(
            strip_numeric(remove_stopwords(strip_punctuation(
                strip_tags(s)))))).split()
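# A minimal sketch of driving _normalize, assuming the gensim helpers are imported
# and that `contractions` is a module-level dict; the mapping below is a hypothetical example.
from gensim.parsing.preprocessing import (remove_stopwords, strip_multiple_whitespaces,
                                          strip_non_alphanum, strip_numeric,
                                          strip_punctuation, strip_tags)

contractions = {"can't": "cannot", "won't": "will not"}  # hypothetical example mapping

tokens = _normalize("<b>We can't ship 3 units</b> until Friday!")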
def clean_text(text):
    """ Cleans the text in the only argument in various steps. NOT USED.
    ARGUMENTS: text: content/title, string
    RETURNS: cleaned text, string"""
    # Expand contractions: you're to you are and so on.
    text = contractions.fix(text)
    # Remove stop words
    text = preprocessing.remove_stopwords(text)
    # Remove html tags
    text = preprocessing.strip_tags(text)
    # Remove punctuation -- all special characters
    text = preprocessing.strip_multiple_whitespaces(preprocessing.strip_punctuation(text))
    return text
def preprocess_text(self, text, tags=False, remove_digits=True):
    """Preprocess text: tokenize docs, lowercase text, remove words with length < min_size,
    remove tags, remove digits-only tokens and remove stopwords."""
    if tags:
        # remove tags
        text = strip_tags(text)
    if remove_digits:
        # tokenize and remove digits-only tokens
        text = [
            token.text for token in self.tokenizer(text)
            if not self.only_digits(token.text)
        ]
    else:
        # tokenize and keep digits-only tokens
        text = [token.text for token in self.tokenizer(text)]
    # return preprocessed doc
    return text
def clean_text(text):
    """ Cleans the text in the only argument in various steps
    ARGUMENTS: text: content/title, string
    RETURNS: cleaned text, string"""
    # Replace newlines by space. We want only one doc vector.
    text = text.replace('\n', ' ').lower()
    # Remove URLs
    text = re.sub(r"http\S+", "", text)
    # Expand contractions: you're to you are and so on.
    text = contractions.fix(text)
    # Remove stop words
    text = preprocessing.remove_stopwords(text)
    # Remove html tags and numbers: can numbers possibly be useful?
    text = preprocessing.strip_tags(text)
    # Remove punctuation -- all special characters
    text = preprocessing.strip_multiple_whitespaces(
        preprocessing.strip_punctuation(text))
    return text
def word_tokenize(text):
    try:
        if isinstance(text, str):
            words = text.lower().split()
        else:
            words = str(text).lower().split()
        if len(words) == 0:
            return ''
        text = ' '.join(words)
        text = preprocessing.strip_punctuation(text)
        text = preprocessing.strip_non_alphanum(text)
        text = preprocessing.strip_numeric(text)
        text = preprocessing.strip_tags(text)
        text = preprocessing.strip_multiple_whitespaces(text)
        return text.encode('utf-8')
    except UnicodeDecodeError as e:
        return ''
def gensim_clean_string(textIn, _strip_tags=True, _split_alphanumeric=True, _strip_nonalphanumeric=True,
                        _strip_muliple_whitespace=True, _strip_short=True, _short_charcount_min=3,
                        _strip_punctuation=False, _convert_to_lower=False):
    cleaner = textIn
    if _strip_tags:
        cleaner = strip_tags(textIn)
    if _strip_nonalphanumeric:
        cleaner = strip_non_alphanum(cleaner)
    if _strip_muliple_whitespace:
        cleaner = strip_multiple_whitespaces(cleaner)
    if _split_alphanumeric:
        cleaner = split_alphanum(cleaner)
    if _strip_short:
        cleaner = strip_short(cleaner, minsize=_short_charcount_min)
    if _convert_to_lower:
        cleaner = cleaner.lower()
    return cleaner
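# A quick usage sketch for gensim_clean_string, assuming these gensim helpers are imported:
from gensim.parsing.preprocessing import (split_alphanum, strip_multiple_whitespaces,
                                          strip_non_alphanum, strip_short, strip_tags)

raw = "<div>Player2 scored   99 points in round3!</div>"
clean = gensim_clean_string(raw, _convert_to_lower=True)
# tags and non-alphanumeric characters are stripped, "Player2" is split into "Player 2",
# and tokens shorter than 3 characters are dropped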
def get_magid_from_annotation(matchobject):
    """ Takes a found dblp annotation, scrapes the website to get the title,
    uses the title to query MAG, and returns a MAG id with the doc id prefix and suffix."""
    cited_dblpurl = matchobject.group(2)
    try:
        res = requests.get(cited_dblpurl, headers=headers)
        # the xml is after 'export record'
        res = res.text[res.text.find('export record'):]
        xml_url_matchobj = xml_p.search(res)
        if xml_url_matchobj is None:
            return 'citation'
        xml_url = xml_url_matchobj.group(2)
        sleep(1)
        print(xml_url)
        xml_res = requests.get(xml_url, headers=headers)
        soup = BeautifulSoup(xml_res.content, 'lxml')
        title_bs4tag = soup.find('title')
        # If it can't find the title for whatever reason, remove the citation
        if title_bs4tag is None:
            return 'citation'
        title = title_bs4tag.string
        if title is None:
            return 'citation'
        title = preprocessing.strip_multiple_whitespaces(
            preprocessing.strip_punctuation(
                preprocessing.strip_tags(title.lower()))).strip()
        pcur.execute(query2, (title, ))
        resultset = pcur.fetchone()
        if resultset is None:
            # If the uuid does not map to a mag id, replace with the word citation.
            # wordindex_magid_dict[i] = 'citation'
            print('not found')
            return 'citation'
        else:
            # print(resultset)
            fetched_magid = resultset['paperid']
            allmagpaperids.add(fetched_magid)
            return '{}{}{}'.format(docid_prefix, fetched_magid, docid_suffix)
    except requests.exceptions.MissingSchema:
        # for GC annotations
        return 'citation'
def map_dblp_to_mag_requests(dblp_url):
    """ Takes a dblp url, scrapes the website for the relevant xml record to get the title,
    and uses that title to map to MAG."""
    try:
        res = requests.get(dblp_url, headers=headers)
        # the xml is after 'export record'
        res = res.text[res.text.find('export record'):]
        xml_url_matchobj = xml_p.search(res)
        if xml_url_matchobj is None:
            return None
        xml_url = xml_url_matchobj.group(2)
        sleep(1)
        print(xml_url)
        xml_res = requests.get(xml_url, headers=headers)
        soup = BeautifulSoup(xml_res.content, 'lxml')
        title_bs4tag = soup.find('title')
        # If it can't find the title for whatever reason, remove the citation
        if title_bs4tag is None:
            return None
        title = title_bs4tag.string
        if title is None:
            return None
        title = preprocessing.strip_multiple_whitespaces(
            preprocessing.strip_punctuation(preprocessing.strip_tags(title.lower()))).strip()
        pcur.execute(query2, (title,))
        resultset = pcur.fetchone()
        if resultset is None:
            # If the uuid does not map to a mag id, replace with the word citation.
            # wordindex_magid_dict[i] = 'citation'
            print('not found')
            return None
        else:
            # print(resultset)
            fetched_magid = resultset['paperid']
            # writer.writerow({'dblp_url': dblp_url, 'mag_id': fetched_magid})
            # allmagpaperids.add(fetched_magid)
            return fetched_magid
    except requests.exceptions.MissingSchema:
        # for GC annotations
        return None
def word_tokenize_and_remove_stop_words(text, stop_word1, stop_word2):
    try:
        if isinstance(text, str):
            words = text.lower().split()
        else:
            words = str(text).lower().split()
        if len(words) == 0:
            return ''
        text = ' '.join(filter(lambda x: x not in stop_word1, words))
        text = preprocessing.strip_punctuation(text)
        text = preprocessing.strip_non_alphanum(text)
        text = preprocessing.strip_numeric(text)
        text = preprocessing.strip_tags(text)
        text = preprocessing.strip_multiple_whitespaces(text)
        words = text.split()
        if len(words) == 0:
            return ''
        text = ' '.join(filter(lambda x: x not in stop_word2, words))
        return text.encode('utf-8')
    except UnicodeDecodeError as e:
        return ''
def sentences_polishing(words_lst, what, deep_polishing=False):
    # calculating char numbers for the entire review list
    lst_len_start = sum(len(s) for s in words_lst)
    print("Cleaning for list with " + str(lst_len_start) + " chars, for " + what)
    # deleting html tags
    words_lst = [strip_tags(x) for x in words_lst]
    # deleting punctuation
    words_lst = [strip_punctuation2(x) for x in words_lst]
    if deep_polishing:
        # Initializing pool for multiprocessing
        pool = Pool(processes=10)
        # for every review, apply function and save result
        words_lst = pool.map(stopWords, words_lst)
        pool.close()
        pool.join()
    # deleting empty reviews
    words_lst = [x for x in words_lst if x]
    # recalculating list chars and printing results
    lst_len_end = sum(len(s) for s in words_lst)
    cleaned = lst_len_start - lst_len_end
    print("Deleted " + str(cleaned) + " (" + str(int(cleaned / lst_len_start * 100)) + "%) chars, for " + what + "\n")
    # freeing memory
    gc.collect()
    return words_lst
def Train_preprocess(yelp_round):
    input_file = 'train_rd%d.tmp' % (yelp_round)
    output_file = './swe_train_rd%d.txt' % (yelp_round)

    fin = open(input_file, 'rb')
    fo = open(output_file, 'wb')

    user_flag = 0
    start = 1
    begin_mark = str('@@@@@begin_mark@@@@@\n')
    for s in fin:
        if s == begin_mark:
            user_flag = 1
            continue
        if user_flag == 1:
            user_flag = 0
            if start != 1:
                fo.write('\n')
            else:
                start = 0
            user_id = s.strip('\n').split()
            if len(user_id) < 1:
                print "there is no user_id following the start_mark!"
            fo.write(user_id[0] + ' ')
            s = ''
            if len(user_id) <= 1:
                continue
            else:
                for i in range(len(user_id) - 1):
                    s = s + user_id[i + 1] + ' '
        try:
            s = s.strip('\n')
            s = preprocessing.strip_punctuation(s)
            s = preprocessing.strip_non_alphanum(s)
            s = preprocessing.strip_numeric(s)
            s = preprocessing.strip_tags(s)
            s = preprocessing.strip_multiple_whitespaces(s)
            s_array = s.encode('utf8').split()
        except UnicodeDecodeError:
            continue
        s = ''
        actual_word_cnt = 0
        for ss in s_array:
            ss = ss.lower()
            actual_word_cnt = actual_word_cnt + 1
            s = s + ss + ' '
        if (actual_word_cnt > 0):
            fo.write(s[:-1])
        else:
            continue
    fin.close()
    fo.close()

    # get user_file and train_file
    if os.path.isfile('./get_user_train_file') == False:
        command = 'gcc get_user_file_w2v_train.c -o get_user_file_w2v_train -lm -pthread -O3 -march=native -Wall -funroll-loops -Wno-unused-result'
        print command
        os.system(command)
    user_file = 'user_file_rd%d.txt' % (yelp_round)
    w2v_train = './w2v_train_rd%d.txt' % (yelp_round)
    command = './get_user_file_w2v_train -input %s -user %s -word %s' % (output_file, user_file, w2v_train)
    print command
    os.system(command)
def remove_tags(string_value):
    """Removes all the tags and markup e.g. <p> </p>."""
    return strip_tags(string_value)
dictionary = np.load("dic40.npy") model = Doc2Vec.load('ricardo_col40') bigram_ = Phraser.load('bigrams_40') dic_mapping = np.load('dic_mapping.npy') stop = stopwords.words('english') + list(string.punctuation) input_ = 'We are trying to use Host Migration with online matchmaker.To make it simple now we are using it now with "Show GUI".Basically we added a custom NetworkMigrationManager, where we only overrided OnClientDisconnectedFromHost, where we call the base function and set a flag to disable any message sending after migration (for testing). See the attached file: HostMigration.cs. We start with 3 players, then the server quits, and host migration happens between the 2 remaining machines by using the UI buttons. It seems like that it happens successfully, there will be a new server, and the another client receives this log: NetworkClient Reconnect::ffff:52.28.11.218:5054 UnityEngine.Networking.NetworkMigrationManager:OnGUI(). But when we try to send the first message (through a chat), we get this error: Send command attempted with no client running [client=hostId: 0 connectionId: 1 isReady: False channel count: 2].UnityEngine.Networking.NetworkBehaviour:SendCommandInternal(NetworkWriter, Int32, String) NetworkPlayer:CallCmdServerChatMessage(PlayerId, String) This is the point where we are stuck...We received this Send command attempted with no client running... message all the time when we try to send any message. What could be the problem? ' #input_ = "I've asked repeatedly about this and given often completely incorrect answers from supposed developers. Its both absurd that this wasnt done years ago, and hasnt been done in 5.4 with the editor now supporting retina. Basically, i'm holding off telling users they need to upgrade from osx 10.7 right now as i'm using Unity 5.2, as 5.3/5.4 has zero additional benefit. Lack of Retina is a deal breaker for me and the reason i won't be using Unity in any future projects ore recommending it to anyone. " # #input_ = 'Is there any reason why the same exact scene, with a large realtime spot on the play area, would have much more pixelated hard shadows under Fantastic quality with Very High Resolution shadows when using 5.4.3f1 instead of 5.3.7f1?' #input_ ='Help! Earlier this year, Allegorithmic released Substance Designer 6 (along with Substance Painter 2.5), which added some great new features and enhanced the functionality of their Substance .sbsar files. Unfortunately, these do not appear to work properly in Unity. Aside from Substances created in Substance Designer 6x just not loading, I find that .sbsar files in Unity can be rendered at no higher a resolution than 2048x2048, despite Allegorithmic\'s format supporting higher resolutions. Checking the software manufacturer\'s forums, they say that unfortunately this is entirely in the hands of Unity. So I\'m posting here and asking, when can we hope to have full compatibility and feature support for Allegorithmic .sbsar files?' test = input_.lower() test = pre.strip_punctuation(test) test = pre.strip_tags(test) test = pre.strip_numeric(test) test_final = [i for i in nltk.word_tokenize(test.encode('utf-8')) if i not in stop] #bigrams = ngrams(test_final,2) result_test_bigram = bigram_[test_final] #print (result_test_bigram) list_input = [] #for bi in bigrams: # for word in bi: # list_input.append(word) #print(list_input) #for i in range(10): #print(help(model)) print(result_test_bigram ) #test_vector = model[test_final] vector_test = model.infer_vector(result_test_bigram,steps=10000)
def __init__(self, documents, speed="fast-learn", workers=None):
    """
    Parameters
    ----------
    documents: list of str
        Input corpus, should be a list of strings.

    speed: string (optional, default 'fast-learn')
        This parameter determines how long the model takes to train.
        The fast-learn option is the fastest and will generate the lowest quality
        vectors. The learn option will learn better quality vectors but take a
        longer time to train. The deep-learn option will learn the best quality
        vectors but will take significant time to train.
        The valid string speed options are:
            * fast-learn
            * learn
            * deep-learn

    workers: int (optional)
        The number of worker threads to be used in training the model. A larger
        number will lead to faster training.
    """
    # validate inputs
    if speed == "fast-learn":
        hs = 0
        negative = 5
        epochs = 40
    elif speed == "learn":
        hs = 1
        negative = 0
        epochs = 40
    elif speed == "deep-learn":
        hs = 1
        negative = 0
        epochs = 400
    else:
        raise ValueError("speed parameter needs to be one of: fast-learn, learn or deep-learn")

    if workers is None:
        pass
    elif isinstance(workers, int):
        pass
    else:
        raise ValueError("workers needs to be an int")

    self.documents = list(documents)

    # preprocess documents for training - tokenize and remove too long/short words
    train_corpus = [
        TaggedDocument(simple_preprocess(strip_tags(doc), deacc=True), [i])
        for i, doc in enumerate(documents)
    ]

    # create document and word embeddings with doc2vec
    if workers is None:
        self.model = Doc2Vec(documents=train_corpus,
                             vector_size=300,
                             min_count=50,
                             window=15,
                             sample=1e-5,
                             negative=negative,
                             hs=hs,
                             epochs=epochs,
                             dm=0,
                             dbow_words=1)
    else:
        self.model = Doc2Vec(documents=train_corpus,
                             vector_size=300,
                             min_count=50,
                             window=15,
                             sample=1e-5,
                             negative=negative,
                             hs=hs,
                             workers=workers,
                             epochs=epochs,
                             dm=0,
                             dbow_words=1)

    # create 5D embeddings of documents
    umap_model = umap.UMAP(n_neighbors=15,
                           n_components=5,
                           metric='cosine').fit(self.model.docvecs.vectors_docs)

    # find dense areas of document vectors
    cluster = hdbscan.HDBSCAN(min_cluster_size=15,
                              metric='euclidean',
                              cluster_selection_method='eom').fit(umap_model.embedding_)

    # calculate topic vectors from dense areas of documents
    self._create_topic_vectors(cluster.labels_)

    # deduplicate topics
    self._deduplicate_topics()

    # calculate topic sizes and index nearest topic for each document
    self._calculate_topic_sizes()

    # find topic words and scores
    self._find_topic_words_scores()
def strip_html(s):
    s = gsp.strip_tags(s)
    return s
def __call__(self, doc):
    striped = prep.strip_punctuation(doc)
    striped = prep.strip_tags(striped)
    striped = prep.strip_multiple_whitespaces(striped).lower()
    return striped
def testStripTags(self):
    self.assertEqual(strip_tags("<i>Hello</i> <b>World</b>!"), "Hello World!")
def NN_preprocess(d_type, yelp_round):
    # preprocessing for sentiment classification using a Deep Neural Network
    if d_type == 'train':
        input_file = 'train_rd%d.tmp' % (yelp_round)
        output_file = './NN_train_rd%d.tmp' % (yelp_round)
    elif d_type == 'dev':
        input_file = 'dev_rd%d.tmp' % (yelp_round)
        output_file = './NN_dev_rd%d.tmp' % (yelp_round)
    elif d_type == 'test':
        input_file = 'test_rd%d.tmp' % (yelp_round)
        output_file = './NN_test_rd%d.tmp' % (yelp_round)
    else:
        print 'No such dataset type: %s' % (d_type)
        return None

    command = 'java -jar Split_NN.jar %s %s' % (input_file, output_file)
    print command
    os.system(command)

    # remove stop words
    if d_type == 'train':
        input_file = './NN_train_rd%d.tmp' % (yelp_round)
        output_file = './NN_train_rd%d.txt' % (yelp_round)
    elif d_type == 'dev':
        input_file = './NN_dev_rd%d.tmp' % (yelp_round)
        output_file = './NN_dev_rd%d.txt' % (yelp_round)
    elif d_type == 'test':
        input_file = './NN_test_rd%d.tmp' % (yelp_round)
        output_file = './NN_test_rd%d.txt' % (yelp_round)
    else:
        print 'No such dataset type: %s' % (d_type)
        return None

    stop_file = 'english_stop.txt'
    fin = open(input_file, 'rb')
    fs = open(stop_file, "rb")
    tar_file = open(output_file, 'w+')

    with open(stop_file, "rb") as f:
        for i, l in enumerate(f):
            pass
    total = i + 1

    # stop words containing an apostrophe (stop_word1) are removed before punctuation
    # stripping; the remaining stop words (stop_word2) are removed afterwards
    stop_word1 = ["" for i in range(total)]
    stop_word2 = ["" for i in range(total)]
    cnt1 = 0
    cnt2 = 0
    for l in fs:
        s = l.strip('\n')
        if "'" in s:
            stop_word1[cnt1] = s
            cnt1 = cnt1 + 1
        else:
            stop_word2[cnt2] = s
            cnt2 = cnt2 + 1

    user_flag = 0
    review_flag = 0
    start = 1
    begin_mark = str('@@@@@begin_mark@@@@@\n')
    for s in fin:
        if s == begin_mark:
            user_flag = 1
            continue
        if user_flag == 1:
            user_flag = 0
            if start != 1:
                tar_file.write('\n')
            else:
                start = 0
            user_star = s.strip('\n').split()
            if (len(user_star) < 2):
                print "there is no user_id & star rating following the start_mark!"
                print len(user_star)
                for i in range(len(user_star)):
                    print user_star[i]
            tar_file.write(user_star[0] + '\t\t')
            tar_file.write(user_star[1] + '\t\t')
            continue
        try:
            s_array = s.encode('utf8').split()
            s = ''
            if len(s_array) > 0:
                for ss in s_array:
                    ss = ss.lower()
                    if ss not in stop_word1:
                        s = s + ss + ' '
                    else:
                        continue
            s = s.strip('\n')
            s = preprocessing.strip_punctuation(s)
            s = preprocessing.strip_non_alphanum(s)
            s = preprocessing.strip_numeric(s)
            s = preprocessing.strip_tags(s)
            s = preprocessing.strip_multiple_whitespaces(s)
            s_array = s.encode('utf8').split()
            s = ''
            actual_word_cnt = 0
            if len(s_array) > 0:
                for ss in s_array:
                    if ss == "RRB" or ss == "LRB" or ss == "LCB" or ss == "RCB":  # -LCB-, -LRB-, -RCB-, -RRB-
                        continue
                    if ss not in stop_word2:
                        s = s + ss + ' '
                        actual_word_cnt = actual_word_cnt + 1
            if (actual_word_cnt > 0):
                tar_file.write(s[:-1])
                tar_file.write('#')
            else:
                continue
        except UnicodeDecodeError:
            continue
    fin.close()
    tar_file.close()

    command = 'rm %s' % (input_file)
    #print command
    os.system(command)
def test_strip_tags(self):
    self.assertEqual(strip_tags("<i>Hello</i> <b>World</b>!"), "Hello World!")
def SVM_preprocess(d_type, yelp_round):
    # preprocessing for sentiment classification using SVM
    # remove punctuation, tags, multiple spaces, stop words; convert all words into lower case
    if d_type == 'train':
        input_file = 'train_rd%d.tmp' % (yelp_round)
        output_file = './SVM_train_rd%d.txt' % (yelp_round)
    elif d_type == 'dev':
        input_file = 'dev_rd%d.tmp' % (yelp_round)
        output_file = './SVM_dev_rd%d.txt' % (yelp_round)
    elif d_type == 'test':
        input_file = 'test_rd%d.tmp' % (yelp_round)
        output_file = './SVM_test_rd%d.txt' % (yelp_round)
    else:
        print 'No such dataset type: %s' % (d_type)
        return None

    stop_file = 'english_stop.txt'
    with open(stop_file, "rb") as f:
        for i, l in enumerate(f):
            pass
    total = i + 1

    fin = open(input_file, "rb")
    fo = open(output_file, "wb")

    # stop words containing an apostrophe (stop_word1) are removed before punctuation
    # stripping; the remaining stop words (stop_word2) are removed afterwards
    stop_word1 = ["" for i in range(total)]
    stop_word2 = ["" for i in range(total)]
    cnt1 = 0
    cnt2 = 0
    with open(stop_file, "rb") as fs:
        for l in fs:
            s = l.strip('\n')
            if "'" in s:
                stop_word1[cnt1] = s
                cnt1 = cnt1 + 1
            else:
                stop_word2[cnt2] = s
                cnt2 = cnt2 + 1

    user_flag = 0
    start = 1
    begin_mark = str('@@@@@begin_mark@@@@@\n')
    for s in fin:
        if s == begin_mark:
            user_flag = 1
            continue
        if user_flag == 1:
            user_flag = 0
            if start != 1:
                fo.write('\n')
            else:
                start = 0
            user_id = s.strip('\n').split()
            if len(user_id) < 2:
                print "there is no user_id & star rating following the start_mark!"
            fo.write(user_id[0] + ' ' + user_id[1] + ' ')
            s = ''
            if len(user_id) <= 2:
                continue
            else:
                for i in range(len(user_id) - 2):
                    s = s + user_id[i + 2] + ' '
                #s = s[:-1]
        try:
            s_array = s.encode('utf8').split()
            s = ''
            if len(s_array) > 0:
                for ss in s_array:
                    ss = ss.lower()
                    if ss not in stop_word1:
                        s = s + ss + ' '
                    else:
                        continue
            s = s.strip('\n')
            if len(s) > 0:
                s = preprocessing.strip_punctuation(s)
                s = preprocessing.strip_non_alphanum(s)
                s = preprocessing.strip_numeric(s)
                s = preprocessing.strip_tags(s)
                s = preprocessing.strip_multiple_whitespaces(s)
                s_array = s.encode('utf8').split()
                s = ''
                if len(s_array) > 0:
                    for ss in s_array:
                        if ss not in stop_word2:
                            s = s + ss + ' '
                        else:
                            continue
                else:
                    continue
            if len(s) > 0:
                if s[-1] != ' ':
                    s = s + ' '
            else:
                continue
            fo.write(s)
        except UnicodeDecodeError:
            continue
    fin.close()
    fo.close()
def default_tokenizer(doc):
    """Tokenize documents for training and remove too long/short words"""
    return simple_preprocess(strip_tags(doc), deacc=True)
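# A minimal usage sketch, assuming gensim's simple_preprocess and strip_tags are imported:
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import strip_tags

tokens = default_tokenizer("<p>Topic modelling needs clean, déjà-vu-free tokens!</p>")
# deacc=True also removes accents, so 'déjà' is tokenized as 'deja'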
def PPL_preprocess(d_type, yelp_round):
    if d_type == 'dev':
        input_file = 'dev_rd%d.tmp' % (yelp_round)
        output_file = 'PPL_dev_rd%d.tmp' % (yelp_round)
    elif d_type == 'test':
        input_file = 'test_rd%d.tmp' % (yelp_round)
        output_file = 'PPL_test_rd%d.tmp' % (yelp_round)
    else:
        print 'No such dataset type: %s' % (d_type)
        return None

    command = 'java -jar Split_PPL.jar %s %s' % (input_file, output_file)
    print command
    os.system(command)

    if d_type == 'dev':
        input_file = 'PPL_dev_rd%d.tmp' % (yelp_round)
        output_file = 'PPL_dev_rd%d.tmp.tmp' % (yelp_round)
    elif d_type == 'test':
        input_file = 'PPL_test_rd%d.tmp' % (yelp_round)
        output_file = 'PPL_test_rd%d.tmp.tmp' % (yelp_round)
    else:
        print 'No such dataset type: %s' % (d_type)
        return None

    fin = open(input_file, 'rb')
    fo = open(output_file, 'wb')
    for s in fin:
        user_id = s.strip('\n').split()
        if len(user_id) <= 1:
            print "there is no word or only user_id in this line!"
            continue
        else:
            fo.write(user_id[0] + ' ')
            s = ''
            for i in range(len(user_id) - 1):
                s = s + user_id[i + 1] + ' '
            s = s[:-1]
        try:
            s = preprocessing.strip_punctuation(s)
            s = preprocessing.strip_non_alphanum(s)
            s = preprocessing.strip_numeric(s)
            s = preprocessing.strip_tags(s)
            s = preprocessing.strip_multiple_whitespaces(s)
            s_array = s.encode('utf8').split()
        except UnicodeDecodeError:
            fo.write('\n')
            continue
        s = ''
        actual_word_cnt = 0
        if len(s_array) > 0:
            for ss in s_array:
                if ss == "RRB" or ss == "LRB" or ss == "LCB" or ss == "RCB":
                    continue
                ss = ss.lower()
                s = s + ss + ' '
                actual_word_cnt = actual_word_cnt + 1
        if actual_word_cnt > 0:
            fo.write(s[:-1])
        fo.write('\n')
    fin.close()
    fo.close()

    command = 'rm %s' % (input_file)
    #print command
    os.system(command)

    # select a sentence for each user
    dic = {}
    lower_bound = 8
    upper_bound = 10
    if d_type == 'dev':
        input_file = './PPL_dev_rd%d.tmp.tmp' % (yelp_round)
        output_file = './PPL_dev_rd%d.txt' % (yelp_round)
    elif d_type == 'test':
        input_file = './PPL_test_rd%d.tmp.tmp' % (yelp_round)
        output_file = './PPL_test_rd%d.txt' % (yelp_round)
    fo = open(output_file, "wb")

    user_count = 0
    user_file = 'user_file_rd%d.txt' % (yelp_round)
    with open(user_file, "rb") as fin:
        for line in fin:
            user_id = line.strip('\n')
            if user_id not in dic.keys():
                dic[user_id] = user_count
                user_count = user_count + 1
    total = user_count
    print "total %d user" % (total)
    recorder = [0 for i in range(total)]

    with open(input_file, "rb") as fin:
        for i, line in enumerate(fin):
            array_line = line.strip('\n').split()
            if array_line[0] == "unknown_user_id":
                pass
            else:
                if recorder[dic[array_line[0]]] != 0:
                    pass
                else:
                    if (len(array_line) >= (lower_bound + 1) and len(array_line) <= (upper_bound + 1)):
                        fo.write(line.strip('\n'))
                        fo.write('\n')
                        recorder[dic[array_line[0]]] = 1

    go_on = 0
    count = 0
    for i in range(total):
        if recorder[i] == 0:
            go_on = 1
            count = count + 1

    if go_on == 1:
        with open(input_file, "rb") as fin:
            for i, line in enumerate(fin):
                array_line = line.strip('\n').split()
                if array_line[0] == "unknown_user_id":
                    pass
                else:
                    if recorder[dic[array_line[0]]] != 0:
                        pass
                    else:
                        if (len(array_line) >= (lower_bound + 1 - 1) and len(array_line) <= (upper_bound + 1 + 1)):
                            fo.write(line.strip('\n'))
                            fo.write('\n')
                            recorder[dic[array_line[0]]] = 1

    go_on = 0
    count = 0
    for i in range(total):
        if recorder[i] == 0:
            go_on = 1
            count = count + 1

    if go_on == 1:
        with open(input_file, "rb") as fin:
            for i, line in enumerate(fin):
                array_line = line.strip('\n').split()
                if array_line[0] == "unknown_user_id":
                    pass
                else:
                    if recorder[dic[array_line[0]]] != 0:
                        pass
                    else:
                        if (len(array_line) >= (lower_bound + 1 - 2) and len(array_line) <= (upper_bound + 1 + 2)):
                            fo.write(line.strip('\n'))
                            fo.write('\n')
                            recorder[dic[array_line[0]]] = 1

    go_on = 0
    count = 0
    for i in range(total):
        if recorder[i] == 0:
            go_on = 1
            count = count + 1

    if go_on == 1:
        with open(input_file, "rb") as fin:
            for i, line in enumerate(fin):
                array_line = line.strip('\n').split()
                if array_line[0] == "unknown_user_id":
                    pass
                else:
                    if recorder[dic[array_line[0]]] != 0:
                        pass
                    else:
                        if (len(array_line) >= (lower_bound + 1 - 3) and len(array_line) <= (upper_bound + 1 + 3)):
                            fo.write(line.strip('\n'))
                            fo.write('\n')
                            recorder[dic[array_line[0]]] = 1

    go_on = 0
    count = 0
    for i in range(total):
        if recorder[i] == 0:
            go_on = 1
            count = count + 1

    if go_on == 1:
        with open(input_file, "rb") as fin:
            for i, line in enumerate(fin):
                array_line = line.strip('\n').split()
                if array_line[0] == "unknown_user_id":
                    pass
                else:
                    if recorder[dic[array_line[0]]] != 0:
                        pass
                    else:
                        fo.write(line.strip('\n'))
                        fo.write('\n')
                        recorder[dic[array_line[0]]] = 1

    go_on = 0
    count = 0
    for i in range(total):
        if recorder[i] == 0:
            go_on = 1
            count = count + 1

    if go_on == 1:
        print "ERROR"

    fo.close()

    command = 'rm %s' % (input_file)
    #print command
    os.system(command)