def __init__(self, annotation_file_path, train_image_folder_path):
    self.annotation_file = annotation_file_path
    self.PATH = train_image_folder_path
    # Parse the annotation file into captions and the matching image paths.
    train_captions, img_path_vector = deal_annotations(
        self.annotation_file, self.PATH)
    # Run image preprocessing over the collected image paths.
    image_preprocess(img_path_vector)
    # Tokenize the captions, recording the longest sequence and the tokenizer.
    caption_vector, self.max_length, self.tokenizer = text_preprocess(
        train_captions, vocab_size=settings.vocab_size)
    # Pair image paths with caption vectors into the training dataset.
    self.dataset = build_dataset(img_path_vector, caption_vector)
    self.total_num = len(img_path_vector)
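# Usage sketch (hypothetical): the enclosing class name and both paths below
# are placeholders, kept as a comment so the class body above stays intact.
#
#   loader = CaptionLoader(
#       annotation_file_path='annotations/captions_train.json',
#       train_image_folder_path='train_images/')
#   print(loader.total_num, loader.max_length)  # corpus size, longest caption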
import os
import sys

import numpy as np
import pandas as pd
from gensim.models import Word2Vec
import nltk
from nltk.tokenize import word_tokenize

# Resolve paths relative to this file so the script runs from anywhere.
this_file_path = os.path.abspath(__file__)
folder_root = os.path.split(this_file_path)[0]
repo_root = os.path.split(folder_root)[0]
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import preprocess as pr

path_corpus = os.path.join(repo_root, "orig_text_data/NI_docs/")

## Load txt document file names
ocr_text = pr.text_preprocess(path_corpus)
ocr_text.files  # 6946 files

# Run through function to gather all text (as a dictionary)
ocr_text_corpus = ocr_text.nvivo_ocr()

# Convert to DataFrame
ocr_corpus = pd.DataFrame(ocr_text_corpus.items())
ocr_corpus.columns = ['img_file', 'raw_text']

# Subset to pages that contain a justification
df = pd.read_csv(os.path.join(repo_root, 'justifications_clean_text_ohe.csv'))
just_imgs = df['img_file_orig'].unique().tolist()
ocr_corpus_subset = ocr_corpus.loc[ocr_corpus['img_file'].isin(just_imgs)]

# Decide whether to use the whole corpus or the subset with justifications
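# A hedged sketch of the Word2Vec step the imports above set up: tokenize each
# page, then fit embeddings. The hyperparameter values and the query word are
# illustrative assumptions, not taken from the repo (gensim >= 4 API).
corpus = ocr_corpus_subset  # or ocr_corpus for the full corpus
# nltk.download('punkt')  # needed once for word_tokenize
sentences = [word_tokenize(t.lower()) for t in corpus['raw_text'].astype(str)]
w2v = Word2Vec(sentences, vector_size=100, window=5, min_count=5, workers=4)
# Example query; any in-vocabulary token works here.
w2v.wv.most_similar('internment', topn=10)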
# Resolve paths relative to this file (folder_root as elsewhere in the repo).
this_file_path = os.path.abspath(__file__)
folder_root = os.path.split(this_file_path)[0]
folder_path = os.path.join(folder_root) + '/'
repo_root = os.path.split(folder_root)[0]
j_path = os.path.join(repo_root, 'orig_text_data/just_0404') + '/'
#j_path = os.path.join(repo_root, 'orig_text_data/just_icr') + '/'  # For ICR task

# Make sure the repo is in the path
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import preprocess as pr

################################################
##### 1) COMPILE JUSTIFICATIONS FROM NVIVO #####
################################################

## Load txt documents
raw = pr.text_preprocess(j_path)
raw.files

cat_text = raw.nvivo_clumps()

for key, value in cat_text.items():
    print(key)

# Create a df of lists: each row is a justification category (12 total)
# followed by a list of the individual "files" within that category.
temp_df = pd.DataFrame.from_dict(cat_text, orient='index')

# What we want is a column with the category repeated for each entry of the
# list; the list entries become rows.
df_long = temp_df.stack().reset_index(level=1, drop=True).to_frame(name='raw_text')
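# Toy illustration of the stack()/reset_index() reshape above; the category
# names and text values are made up for demonstration only.
toy = {'cat_A': ['text one', 'text two'], 'cat_B': ['text three']}
toy_df = pd.DataFrame.from_dict(toy, orient='index')
toy_long = toy_df.stack().reset_index(level=1, drop=True).to_frame(name='raw_text')
print(toy_long)
#          raw_text
# cat_A    text one
# cat_A    text two
# cat_B  text three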
'''
Main function:
1. Reads the JSON file, processes it, and converts it to a DataFrame.
2. Performs text cleaning (removes spaces, punctuation, and short words; lower-cases).
3. Does text preparation (stop-word removal, stemming, and lemmatization).
4. Applies Latent Semantic Analysis (LSA) for feature extraction.
5. Classifies documents based on the LSA features into two classes:
   Patent Granted and Patent Not-Granted.
'''
if __name__ == "__main__":
    # Convert JSON to DataFrame
    df = json_to_df('uspto.json')

    # Preprocess text from the summary column of the DataFrame
    tp = text_preprocess(df)
    tp.text_cleaning()
    df = tp.prepare_text()

    # Latent Semantic Analysis for feature extraction
    lsa = feature_extraction(df)
    dtm, vectorizer = lsa.document_term_matrix()
    lsa.topic_model(dtm, vectorizer)

    # Train model for classification
    X_train, X_test, y_train, y_test = train_test_split(
        dtm, df['Decision'].astype('int'), test_size=0.20, random_state=0)
    clf = classify(X_train, X_test, y_train, y_test)
    clf.make_classification()
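# For reference, a minimal self-contained version of the LSA step described in
# the docstring, using scikit-learn. The repo's feature_extraction class may
# choose a different vectorizer or parameters; the values here are assumptions.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

def lsa_features(texts, n_topics=50):
    # A TF-IDF document-term matrix followed by truncated SVD is classic LSA.
    vectorizer = TfidfVectorizer(stop_words='english', max_df=0.95, min_df=2)
    dtm = vectorizer.fit_transform(texts)
    svd = TruncatedSVD(n_components=n_topics, random_state=0)
    return svd.fit_transform(dtm), vectorizer, svd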
import os
import sys

import pandas as pd
import numpy as np

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import preprocess as pr

## Set project path
this_file_path = os.path.abspath(__file__)
folder_root = os.path.split(this_file_path)[0]
repo_root = os.path.split(folder_root)[0]
all_path = os.path.join(repo_root, "orig_text_data/NI_docs/")

## Load txt documents (all)
raw = pr.text_preprocess(all_path)
raw.files

## Randomly draw 10 documents
random_draw = np.random.choice(raw.files, 10)

## Loop through the 10 documents to count words and characters (not including spaces)
words2 = {}
characters2 = {}
characters3 = []
for f in random_draw:
    # Extract text and tally counts line by line
    with open(f) as text:
        words = 0
        characters = 0
        for line in text:
            line_words = line.split()
            words += len(line_words)
            characters += sum(len(w) for w in line_words)
    # Store per-document counts (structure assumed from the variable names above)
    words2[f] = words
    characters2[f] = characters
    characters3.append(characters)
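# Illustrative summary of the per-document counts collected above; the column
# names in this sketch are made up.
summary = pd.DataFrame({
    'file': list(words2),
    'n_words': [words2[f] for f in words2],
    'n_chars': [characters2[f] for f in words2],
})
summary['chars_per_word'] = summary['n_chars'] / summary['n_words']
print(summary.describe())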